Load needed libraries
library(readr)
library(ggplot2)
library(dplyr)
library(caret)
library(glmnet)
library(boot)
library(tree)
library(ranger)
library(xgboost)
library(gbm)
library(vip)
library(ISLR)
library(tidyr)
library(gridExtra)
library(reshape2)
Set the seed for reproducibility
set.seed(1)
Load the dataset
original_lc_data <- read.csv("LCdata.csv",sep = ";")
lc_data <- original_lc_data
Remove attributes not available for prediction
lc_data <- subset(lc_data, select = -c(collection_recovery_fee, installment, issue_d,
last_pymnt_amnt, last_pymnt_d, loan_status,
next_pymnt_d, out_prncp, out_prncp_inv,
pymnt_plan, recoveries, total_pymnt,
total_pymnt_inv,total_rec_int, total_rec_late_fee,
total_rec_prncp))
summary(lc_data)
id member_id loan_amnt funded_amnt funded_amnt_inv term
Min. : 54734 Min. : 70473 Min. : 500 Min. : 500 Min. : 0 Length:798641
1st Qu.: 9207230 1st Qu.:10877939 1st Qu.: 8000 1st Qu.: 8000 1st Qu.: 8000 Class :character
Median :34433372 Median :37095300 Median :13000 Median :13000 Median :13000 Mode :character
Mean :32463636 Mean :35000265 Mean :14754 Mean :14741 Mean :14702
3rd Qu.:54900100 3rd Qu.:58470266 3rd Qu.:20000 3rd Qu.:20000 3rd Qu.:20000
Max. :68617057 Max. :73544841 Max. :35000 Max. :35000 Max. :35000
int_rate emp_title emp_length home_ownership annual_inc verification_status
Min. : 5.32 Length:798641 Length:798641 Length:798641 Min. : 0 Length:798641
1st Qu.: 9.99 Class :character Class :character Class :character 1st Qu.: 45000 Class :character
Median :12.99 Mode :character Mode :character Mode :character Median : 65000 Mode :character
Mean :13.24 Mean : 75014
3rd Qu.:16.20 3rd Qu.: 90000
Max. :28.99 Max. :9500000
NA's :4
url desc purpose title zip_code addr_state
Length:798641 Length:798641 Length:798641 Length:798641 Length:798641 Length:798641
Class :character Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character
dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record
Min. : 0.00 Min. : 0.0000 Length:798641 Min. : 0.0000 Min. : 0.0 Min. : 0.0
1st Qu.: 11.91 1st Qu.: 0.0000 Class :character 1st Qu.: 0.0000 1st Qu.: 15.0 1st Qu.: 51.0
Median : 17.66 Median : 0.0000 Mode :character Median : 0.0000 Median : 31.0 Median : 70.0
Mean : 18.16 Mean : 0.3145 Mean : 0.6947 Mean : 34.1 Mean : 70.1
3rd Qu.: 23.95 3rd Qu.: 0.0000 3rd Qu.: 1.0000 3rd Qu.: 50.0 3rd Qu.: 92.0
Max. :9999.00 Max. :39.0000 Max. :33.0000 Max. :188.0 Max. :129.0
NA's :25 NA's :25 NA's :408818 NA's :675190
open_acc pub_rec revol_bal revol_util total_acc initial_list_status
Min. : 0.00 Min. : 0.0000 Min. : 0 Min. : 0.00 Min. : 1.00 Length:798641
1st Qu.: 8.00 1st Qu.: 0.0000 1st Qu.: 6443 1st Qu.: 37.70 1st Qu.: 17.00 Class :character
Median :11.00 Median : 0.0000 Median : 11876 Median : 56.00 Median : 24.00 Mode :character
Mean :11.55 Mean : 0.1953 Mean : 16930 Mean : 55.05 Mean : 25.27
3rd Qu.:14.00 3rd Qu.: 0.0000 3rd Qu.: 20839 3rd Qu.: 73.50 3rd Qu.: 32.00
Max. :90.00 Max. :63.0000 Max. :2904836 Max. :892.30 Max. :169.00
NA's :25 NA's :25 NA's :2 NA's :454 NA's :25
last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type
Length:798641 Min. : 0.00000 Min. : 0.0 Min. :1 Length:798641
Class :character 1st Qu.: 0.00000 1st Qu.: 27.0 1st Qu.:1 Class :character
Mode :character Median : 0.00000 Median : 44.0 Median :1 Mode :character
Mean : 0.01447 Mean : 44.1 Mean :1
3rd Qu.: 0.00000 3rd Qu.: 61.0 3rd Qu.:1
Max. :20.00000 Max. :188.0 Max. :1
NA's :126 NA's :599107
annual_inc_joint dti_joint verification_status_joint acc_now_delinq tot_coll_amt tot_cur_bal
Min. : 17950 Min. : 3.0 Length:798641 Min. : 0.000000 Min. : 0 Min. : 0
1st Qu.: 76167 1st Qu.:13.3 Class :character 1st Qu.: 0.000000 1st Qu.: 0 1st Qu.: 29861
Median :101886 Median :17.7 Mode :character Median : 0.000000 Median : 0 Median : 80647
Mean :110745 Mean :18.4 Mean : 0.005026 Mean : 228 Mean : 139508
3rd Qu.:133000 3rd Qu.:22.6 3rd Qu.: 0.000000 3rd Qu.: 0 3rd Qu.: 208229
Max. :500000 Max. :43.9 Max. :14.000000 Max. :9152545 Max. :8000078
NA's :798181 NA's :798183 NA's :25 NA's :63276 NA's :63276
open_acc_6m open_il_6m open_il_12m open_il_24m mths_since_rcnt_il total_bal_il
Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
1st Qu.: 0.0 1st Qu.: 1.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 6.0 1st Qu.: 10164
Median : 1.0 Median : 2.0 Median : 0.0 Median : 1.0 Median : 12.0 Median : 24544
Mean : 1.1 Mean : 2.9 Mean : 0.8 Mean : 1.7 Mean : 21.1 Mean : 36428
3rd Qu.: 2.0 3rd Qu.: 4.0 3rd Qu.: 1.0 3rd Qu.: 2.0 3rd Qu.: 23.0 3rd Qu.: 47640
Max. :14.0 Max. :33.0 Max. :12.0 Max. :19.0 Max. :363.0 Max. :878459
NA's :779525 NA's :779525 NA's :779525 NA's :779525 NA's :780030 NA's :779525
il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim
Min. : 0.0 Min. : 0.0 Min. : 0 Min. : 0 Min. : 0.0 Min. : 0
1st Qu.: 58.4 1st Qu.: 0.0 1st Qu.: 1 1st Qu.: 2406 1st Qu.: 47.6 1st Qu.: 13900
Median : 74.8 Median : 1.0 Median : 2 Median : 4502 Median : 61.9 Median : 23700
Mean : 71.5 Mean : 1.4 Mean : 3 Mean : 5878 Mean : 60.8 Mean : 32093
3rd Qu.: 87.7 3rd Qu.: 2.0 3rd Qu.: 4 3rd Qu.: 7774 3rd Qu.: 75.2 3rd Qu.: 39800
Max. :223.3 Max. :22.0 Max. :43 Max. :83047 Max. :151.4 Max. :9999999
NA's :782007 NA's :779525 NA's :779525 NA's :779525 NA's :779525 NA's :63276
inq_fi total_cu_tl inq_last_12m
Min. : 0.0 Min. : 0.0 Min. :-4
1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0
Median : 0.0 Median : 0.0 Median : 2
Mean : 0.9 Mean : 1.5 Mean : 2
3rd Qu.: 1.0 3rd Qu.: 2.0 3rd Qu.: 3
Max. :16.0 Max. :35.0 Max. :32
NA's :779525 NA's :779525 NA's :779525
First we delete the columns which aren’t useful for our prediction
lc_data$id <- NULL
lc_data$member_id <- NULL
lc_data$zip_code <- NULL
lc_data$url <- NULL
Looks like policy_code contains just value equal to 1, it can be removed
lc_data$policy_code <- NULL
Remove additional columns which are related to the historical data
lc_data$last_credit_pull_d <- NULL
Then we delete the columns which can’t be converted to categorical and require NLP
lc_data$title <- NULL
lc_data$desc <- NULL
lc_data$emp_title <- NULL
Let’s examine the loan_amnt column
sum(is.na(lc_data$loan_amnt))
[1] 0
cor(lc_data$loan_amnt, lc_data$int_rate)
[1] 0.1447189
hist(lc_data$loan_amnt, breaks = 20, main = "loan_amnt distribution", xlab = "loan_amnt", col = "lightblue", border = "black")
ggplot(data = lc_data, mapping = aes(x=int_rate,y=loan_amnt)) + geom_boxplot()
Standardize loan_amnt
#lc_data$loan_amnt <- scale(lc_data$loan_amnt)
Let’s examine the funded_amnt column
sum(is.na(lc_data$funded_amnt))
[1] 0
cor(lc_data$funded_amnt, lc_data$int_rate)
[1] 0.1448634
hist(lc_data$funded_amnt, breaks = 20, main = "funded_amnt distribution", xlab = "funded_amnt", col = "lightblue", border = "black")
As we can see, funded_amnt is almost the same as the loan_amnt column, consequently, we remove it.
lc_data$funded_amnt <- NULL
Let’s examine the funded_amnt_inv column
sum(is.na(lc_data$funded_amnt_inv))
[1] 0
cor(lc_data$funded_amnt_inv, lc_data$int_rate)
[1] 0.1449083
hist(lc_data$funded_amnt_inv, breaks = 20, main = "funded_amnt_inv distribution", xlab = "funded_amnt_inv", col = "lightblue", border = "black")
Remove funded_amnt_inv for the same reason as above
lc_data$funded_amnt_inv <- NULL
Let’s see the int_rate distribution.
hist(lc_data$int_rate, breaks = 20, main = "int_rate distribution", xlab = "int_rate", col = "lightblue", border = "black")
Standardize int rate:
#lc_data$int_rate <- scale(lc_data$int_rate)
As we can observe, there are 40363 NAs. We can assume 40363 do not work.
barplot(table(lc_data$emp_length),
xlab = "emp_length years",
ylab = "Frequency",
col = "skyblue",
border = "black",
cex.names = 0.6) # The size of the main title
Since emp_length seems to be categorical, we transform it to as a factor and then as numeric. The conversion to numeric is needed for supporting the Decision Trees and XGBoost
lc_data$emp_length <- as.factor(lc_data$emp_length)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=emp_length)) + geom_boxplot()
lc_data$emp_length <- as.numeric(lc_data$emp_length)
As we can see, term plays a crucial role in predicting the interest rate.
lc_data$term <- as.factor(lc_data$term)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=term)) + geom_boxplot()
lc_data$term <- as.numeric(lc_data$term)
Cleaning of home_ownership:
During the data cleaning phase, our analysis revealed that the variable “home_ownership” does not show a distinct correlation with interest rates. Specifically, among the categories, “ANY” and “OTHER” contain 2 and 154 cases, respectively, while the “NONE” category comprises 39 cases. Although the “NONE” category appears to demonstrate a higher interest rate compared to others, the limited sample size of 39 cases raises doubts about the reliability of this observation. Notably, the “NONE” category might pertain to individuals experiencing homelessness, prompting ethical concerns about loan provision to this demographic.
table(lc_data$home_ownership)
ANY MORTGAGE NONE OTHER OWN RENT
2 399151 45 155 78789 320499
ggplot(data = lc_data, mapping = aes(x=int_rate,y=home_ownership)) + geom_boxplot()
Then, we retain mortgage, own and rent:
lc_data <- lc_data %>% filter(home_ownership %in% c("MORTGAGE","OWN","RENT"))
lc_data$home_ownership <- as.numeric(as.factor(lc_data$home_ownership))
Most of the loan applications are Individual, this means that most of the values of the columns dti_joint,annual_inc_joint and verification_status_joint are Null. We would like to keep the information about Joint loans, this means that we can replace the Null values with 0.
nav <- c('', ' ')
lc_data <- transform(lc_data, verification_status_joint=replace(verification_status_joint, verification_status_joint %in% nav, NA))
lc_data <-
lc_data %>%
mutate(dti_joint = ifelse(is.na(dti_joint) == TRUE, 0, dti_joint)) %>%
mutate(annual_inc_joint = ifelse(is.na(annual_inc_joint) == TRUE, 0, annual_inc_joint)) %>%
mutate(verification_status_joint = ifelse(is.na(verification_status_joint) == TRUE, 'NA', verification_status_joint))
The empty string or null value in verification_status_joint is replaced successfully.
table(lc_data$verification_status)
Not Verified Source Verified Verified
240255 296631 261553
table(lc_data$verification_status_joint)
NA Not Verified Source Verified Verified
797979 253 53 154
Then verification_status_joint and verification_status columns are converted in categorical and then numerical value. The column application_type is obsolete, since the information about whether the loan is individual or joint is already contained in the previous variables.
lc_data$verification_status <- as.numeric(as.factor(lc_data$verification_status))
lc_data$verification_status_joint <- as.numeric(as.factor(lc_data$verification_status_joint))
lc_data <- lc_data %>% select(-application_type)
Let’s check if other is NA or a real value for purpose. It’s a real one, so we don’t have to handle it.
lc_data$purpose <- as.factor(lc_data$purpose)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=purpose)) + geom_boxplot()
lc_data$purpose <- as.numeric(lc_data$purpose)
Let’s have a glance to the state address:
table(lc_data$addr_state)
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS
1992 10101 5953 18359 116578 16934 12154 2188 2268 54819 26146 4112 13 11 31880 12393 7105
KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV
7726 9498 18546 18906 469 20678 14306 12821 3455 2286 22135 431 1064 3865 29991 4428 11155
NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY
66790 26682 7266 9806 28221 3499 9609 1615 11618 63982 5629 23616 1606 17470 10446 3977 1841
lc_data$addr_state <- as.factor(lc_data$addr_state)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=addr_state)) + geom_boxplot()
lc_data$addr_state <- as.numeric(lc_data$addr_state)
Regarding delinquency in the last 2 years, there are few NAs then remove them:
lc_data <- lc_data %>%
filter(!(is.na(delinq_2yrs)))
The columns mths_since_delinq_cat, mths_since_last_record, mths_since_rcnt_il and mths_since_last_major_derog contain numerical values which refer to the number of the months. Since this columns contain a lot of null values which can’t be replaced with 0’s, one of the most appropriate operations that can be made is applying discretization. We do this by creating a set of contiguous bins based on years, while for the null values we create a separate bin.
lc_data <- lc_data %>%
mutate(mths_since_delinq_cat = ifelse(
is.na(mths_since_last_delinq) == TRUE,
"NONE",
ifelse(
mths_since_last_delinq <= 12,
"Less_1_Y",
ifelse(
mths_since_last_delinq <= 24,
"Less_2_Y",
ifelse(
mths_since_last_delinq <= 36,
"Less_3_Y",
ifelse(mths_since_last_delinq <= 48, "Less_4_Y", "More_4_Y")
)
)
)
)) %>% select(-mths_since_last_delinq)
lc_data$mths_since_delinq_cat <- as.factor(lc_data$mths_since_delinq_cat)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=mths_since_delinq_cat))+geom_boxplot()
lc_data$mths_since_delinq_cat <- as.numeric(lc_data$mths_since_delinq_cat)
lc_data <- lc_data %>%
mutate(mths_since_last_record_cat = ifelse(
is.na(mths_since_last_record) == TRUE,
"NONE",
ifelse(
mths_since_last_record <= 12,
"Less_1_Y",
ifelse(
mths_since_last_record <= 24,
"Less_2_Y",
ifelse(
mths_since_last_record <= 36,
"Less_3_Y",
ifelse(mths_since_last_record <= 48, "Less_4_Y", "More_4_Y")
)
)
)
)) %>% select(-mths_since_last_record)
lc_data$mths_since_last_record_cat <- as.factor(lc_data$mths_since_last_record_cat)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=mths_since_last_record_cat))+geom_boxplot()
lc_data$mths_since_last_record_cat <- as.numeric(lc_data$mths_since_last_record_cat)
lc_data <-lc_data %>%
mutate(mths_since_rcnt_il_cat = ifelse(
is.na(mths_since_rcnt_il) == TRUE,
"NONE",
ifelse(
mths_since_rcnt_il <= 12,
"Less_1_Y",
ifelse(
mths_since_rcnt_il <= 24,
"Less_2_Y",
ifelse(
mths_since_rcnt_il <= 36,
"Less_3_Y",
ifelse(mths_since_rcnt_il <= 48, "Less_4_Y", "More_4_Y")
)
)
)
)) %>% select(-mths_since_rcnt_il)
lc_data$mths_since_rcnt_il_cat <- as.factor(lc_data$mths_since_rcnt_il_cat)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=mths_since_rcnt_il_cat))+geom_boxplot()
lc_data$mths_since_rcnt_il_cat <- as.numeric(lc_data$mths_since_rcnt_il_cat)
lc_data <-lc_data %>%
mutate(mths_since_last_major_derog_cat = ifelse(
is.na(mths_since_last_major_derog) == TRUE,
"NONE",
ifelse(
mths_since_last_major_derog <= 12,
"Less_1_Y",
ifelse(
mths_since_last_major_derog <= 24,
"Less_2_Y",
ifelse(
mths_since_last_major_derog <= 36,
"Less_3_Y",
ifelse(mths_since_last_major_derog <= 48, "Less_4_Y", "More_4_Y")
)
)
)
)) %>% select(-mths_since_last_major_derog)
lc_data$mths_since_last_major_derog_cat <- as.factor(lc_data$mths_since_last_major_derog_cat)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=mths_since_last_major_derog_cat))+geom_boxplot()
lc_data$mths_since_last_major_derog_cat <- as.numeric(lc_data$mths_since_last_major_derog_cat)
The variable initial_list_status identifies whether a loan was initially listed in the whole (W) or fractional (F) market. This variable could be useful so we can keep it and transform it to a factor and then to a numeric value, for the same purpose of compatibility with the XGBoost function.
lc_data$initial_list_status <- as.factor(lc_data$initial_list_status)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=initial_list_status))+geom_boxplot()
lc_data$initial_list_status <- as.numeric(lc_data$initial_list_status)
Let’s check which columns still have null values
colSums(is.na(lc_data))
loan_amnt term int_rate
0 0 0
emp_length home_ownership annual_inc
0 0 0
verification_status purpose addr_state
0 0 0
dti delinq_2yrs earliest_cr_line
0 0 0
inq_last_6mths open_acc pub_rec
0 0 0
revol_bal revol_util total_acc
2 428 0
initial_list_status collections_12_mths_ex_med annual_inc_joint
0 99 0
dti_joint verification_status_joint acc_now_delinq
0 0 0
tot_coll_amt tot_cur_bal open_acc_6m
63132 63132 779302
open_il_6m open_il_12m open_il_24m
779302 779302 779302
total_bal_il il_util open_rv_12m
779302 781784 779302
open_rv_24m max_bal_bc all_util
779302 779302 779302
total_rev_hi_lim inq_fi total_cu_tl
63132 779302 779302
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
779302 0 0
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
0 0
The columns revol_bal and revol_util contain only few NA values, those values can’t be replaced with 0, then we filter the values which are not NAs.
lc_data <- lc_data %>%
filter(!(is.na(revol_bal))) %>%
filter(!(is.na(revol_util)))
Let’s check which columns still have null values:
names(which(colSums(is.na(lc_data)) > 0))
[1] "collections_12_mths_ex_med" "tot_coll_amt" "tot_cur_bal" "open_acc_6m"
[5] "open_il_6m" "open_il_12m" "open_il_24m" "total_bal_il"
[9] "il_util" "open_rv_12m" "open_rv_24m" "max_bal_bc"
[13] "all_util" "total_rev_hi_lim" "inq_fi" "total_cu_tl"
[17] "inq_last_12m"
Replace null values with 0 where is possible
lc_data <-
lc_data %>%
mutate(open_acc_6m = ifelse(is.na(open_acc_6m) == TRUE, 0, open_acc_6m)) %>%
mutate(tot_cur_bal = ifelse(is.na(tot_cur_bal) == TRUE, 0, tot_cur_bal)) %>%
mutate(open_il_6m = ifelse(is.na(open_il_6m) == TRUE, 0, open_il_6m)) %>%
mutate(open_il_12m = ifelse(is.na(open_il_12m) == TRUE, 0, open_il_12m)) %>%
mutate(open_il_24m = ifelse(is.na(open_il_24m) == TRUE, 0, open_il_24m)) %>%
mutate(total_bal_il = ifelse(is.na(total_bal_il) == TRUE, 0, total_bal_il)) %>%
mutate(il_util = ifelse(is.na(il_util) == TRUE, 0, il_util)) %>%
mutate(open_rv_12m = ifelse(is.na(open_rv_12m) == TRUE, 0, open_rv_12m)) %>%
mutate(total_rev_hi_lim = ifelse(is.na(total_rev_hi_lim) == TRUE, 0, total_rev_hi_lim)) %>%
mutate(max_bal_bc = ifelse(is.na(max_bal_bc) == TRUE, 0, max_bal_bc)) %>%
mutate(all_util = ifelse(is.na(all_util) == TRUE, 0, all_util)) %>%
mutate(inq_fi = ifelse(is.na(inq_fi) == TRUE, 0, inq_fi)) %>%
mutate(total_cu_tl = ifelse(is.na(total_cu_tl) == TRUE, 0, total_cu_tl)) %>%
mutate(inq_last_12m = ifelse(is.na(inq_last_12m) == TRUE, 0, inq_last_12m)) %>%
mutate(open_rv_24m = ifelse(is.na(open_rv_24m) == TRUE, 0, open_rv_24m)) %>%
mutate(tot_coll_amt = ifelse(is.na(tot_coll_amt)== TRUE,0, tot_coll_amt)) %>%
mutate(collections_12_mths_ex_med = ifelse(is.na(collections_12_mths_ex_med)== TRUE,0, collections_12_mths_ex_med))
earliest_cr_line contains the month the borrower’s earliest reported credit line was opened. Even if this date consists only on month and year, still there are too many unique values. We could transform the dates in to a numerical value, by converting them from date into Unix Time. This unit measures time by the number of seconds that have elapsed since 00:00:00 UTC on 1 January 1970. Since this column doesn’t contain the day number, we take as a reference the first day of the month.
lc_data <- lc_data %>%
filter(!(is.na(earliest_cr_line)))
# function to replace dates with unix time
to_unix_time <- function(date) {
tmp <- paste("01", date, sep="-")
return (as.numeric(as.POSIXct(tmp, format="%d-%b-%Y", tz="UTC")))
}
# map dates to unix time
lc_data$earliest_cr_line <- apply(lc_data, 1, function(row) to_unix_time(row["earliest_cr_line"]))
# standardize them
#lc_data$earliest_cr_line <- scale(lc_data$earliest_cr_line)
Outliers Removal:
boxplot(lc_data$int_rate)
# Identify outliers using boxplot
outliers <- boxplot(lc_data$int_rate, plot = FALSE)$out
# Remove outliers from the dataset
lc_data <- lc_data[!lc_data$int_rate %in% outliers, ]
summary(lc_data)
loan_amnt term int_rate emp_length home_ownership annual_inc verification_status
Min. : 500 Min. :1.000 Min. : 5.32 Min. : 1.00 Min. :1.000 Min. : 0 Min. :1.000
1st Qu.: 8000 1st Qu.:1.000 1st Qu.: 9.99 1st Qu.: 3.00 1st Qu.:1.000 1st Qu.: 45000 1st Qu.:1.000
Median :13000 Median :1.000 Median :12.99 Median : 4.00 Median :1.000 Median : 65000 Median :2.000
Mean :14718 Mean :1.296 Mean :13.15 Mean : 5.11 Mean :1.901 Mean : 75007 Mean :2.023
3rd Qu.:20000 3rd Qu.:2.000 3rd Qu.:15.95 3rd Qu.: 7.00 3rd Qu.:3.000 3rd Qu.: 90000 3rd Qu.:3.000
Max. :35000 Max. :2.000 Max. :25.28 Max. :12.00 Max. :3.000 Max. :9500000 Max. :3.000
purpose addr_state dti delinq_2yrs earliest_cr_line inq_last_6mths
Min. : 1.000 Min. : 1.00 Min. : 0.00 Min. : 0.0000 Min. :-820540800 Min. : 0.0000
1st Qu.: 3.000 1st Qu.:10.00 1st Qu.: 11.90 1st Qu.: 0.0000 1st Qu.: 770428800 1st Qu.: 0.0000
Median : 3.000 Median :24.00 Median : 17.64 Median : 0.0000 Median : 933465600 Median : 0.0000
Mean : 3.562 Mean :24.14 Mean : 18.15 Mean : 0.3138 Mean : 888844650 Mean : 0.6899
3rd Qu.: 3.000 3rd Qu.:37.00 3rd Qu.: 23.93 3rd Qu.: 0.0000 3rd Qu.:1051747200 3rd Qu.: 1.0000
Max. :14.000 Max. :51.00 Max. :9999.00 Max. :39.0000 Max. :1351728000 Max. :33.0000
open_acc pub_rec revol_bal revol_util total_acc initial_list_status
Min. : 1.00 Min. : 0.000 Min. : 0 Min. : 0.00 Min. : 1.00 Min. :1.000
1st Qu.: 8.00 1st Qu.: 0.000 1st Qu.: 6456 1st Qu.: 37.60 1st Qu.: 17.00 1st Qu.:1.000
Median :11.00 Median : 0.000 Median : 11888 Median : 55.90 Median : 24.00 Median :1.000
Mean :11.55 Mean : 0.195 Mean : 16943 Mean : 55.02 Mean : 25.27 Mean :1.486
3rd Qu.:14.00 3rd Qu.: 0.000 3rd Qu.: 20849 3rd Qu.: 73.50 3rd Qu.: 32.00 3rd Qu.:2.000
Max. :90.00 Max. :63.000 Max. :2904836 Max. :892.30 Max. :169.00 Max. :2.000
collections_12_mths_ex_med annual_inc_joint dti_joint verification_status_joint acc_now_delinq
Min. : 0.00000 Min. : 0.0 Min. : 0.00000 Min. :1.000 Min. : 0.00000
1st Qu.: 0.00000 1st Qu.: 0.0 1st Qu.: 0.00000 1st Qu.:1.000 1st Qu.: 0.00000
Median : 0.00000 Median : 0.0 Median : 0.00000 Median :1.000 Median : 0.00000
Mean : 0.01444 Mean : 63.2 Mean : 0.01032 Mean :1.001 Mean : 0.00498
3rd Qu.: 0.00000 3rd Qu.: 0.0 3rd Qu.: 0.00000 3rd Qu.:1.000 3rd Qu.: 0.00000
Max. :20.00000 Max. :500000.0 Max. :43.86000 Max. :4.000 Max. :14.00000
tot_coll_amt tot_cur_bal open_acc_6m open_il_6m open_il_12m open_il_24m
Min. : 0 Min. : 0 Min. : 0.0000 Min. : 0.00000 Min. : 0.00000 Min. : 0.00000
1st Qu.: 0 1st Qu.: 23128 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.00000
Median : 0 Median : 65355 Median : 0.0000 Median : 0.00000 Median : 0.00000 Median : 0.00000
Mean : 210 Mean : 128454 Mean : 0.0263 Mean : 0.06979 Mean : 0.01803 Mean : 0.03974
3rd Qu.: 0 3rd Qu.: 195918 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.: 0.00000
Max. :9152545 Max. :8000078 Max. :14.0000 Max. :33.00000 Max. :12.00000 Max. :15.00000
total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util
Min. : 0.0 Min. : 0.000 Min. : 0.00000 Min. : 0.00000 Min. : 0.0 Min. : 0.000
1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.0 1st Qu.: 0.000
Median : 0.0 Median : 0.000 Median : 0.00000 Median : 0.00000 Median : 0.0 Median : 0.000
Mean : 872.1 Mean : 1.488 Mean : 0.03307 Mean : 0.07102 Mean : 141.1 Mean : 1.456
3rd Qu.: 0.0 3rd Qu.: 0.000 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.: 0.0 3rd Qu.: 0.000
Max. :878459.0 Max. :200.000 Max. :22.00000 Max. :43.00000 Max. :83047.0 Max. :151.400
total_rev_hi_lim inq_fi total_cu_tl inq_last_12m mths_since_delinq_cat
Min. : 0 Min. : 0.00000 Min. : 0.00000 Min. :-4.00000 Min. :1.000
1st Qu.: 11700 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.:3.000
Median : 21800 Median : 0.00000 Median : 0.00000 Median : 0.00000 Median :6.000
Mean : 29595 Mean : 0.02251 Mean : 0.03675 Mean : 0.04707 Mean :4.577
3rd Qu.: 37900 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.:6.000
Max. :9999999 Max. :16.00000 Max. :35.00000 Max. :32.00000 Max. :6.000
mths_since_last_record_cat mths_since_rcnt_il_cat mths_since_last_major_derog_cat
Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:6.000 1st Qu.:6.000 1st Qu.:6.000
Median :6.000 Median :6.000 Median :6.000
Mean :5.779 Mean :5.906 Mean :5.436
3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000
Max. :6.000 Max. :6.000 Max. :6.000
Learning Algorithms
# create indices for splitting (80% train, 20% test)
train_indices <- createDataPartition(lc_data$int_rate, p = 0.8, list = FALSE)
# create training and testing datasets
train_data <- lc_data[train_indices, ]
test_data <- lc_data[-train_indices, ]
#### Linear Regression ####
lm.fit <- lm(int_rate ~ ., data = train_data)
# make predictions on the training and testing data
lm.train_predictions <- predict(lm.fit, newdata = train_data)
lm.test_predictions <- predict(lm.fit, newdata = test_data)
# calculate Mean Squared Error (MSE) for training and testing
lm.train_mse <- mean((lm.train_predictions - train_data$int_rate)^2)
lm.test_mse <- mean((lm.test_predictions - test_data$int_rate)^2)
# calculate Root Mean Squared Error (RMSE) for training and testing
lm.train_rmse <- sqrt(lm.train_mse)
lm.test_rmse <- sqrt(lm.test_mse)
# calculate Mean Absolute Error (MAE) for training and testing
lm.train_mae <- mean(abs(lm.train_predictions - train_data$int_rate))
lm.test_mae <- mean(abs(lm.test_predictions - test_data$int_rate))
# calculate R-squared (R²) for training and testing
lm.train_r2 <- 1 - (sum((train_data$int_rate - lm.train_predictions)^2) / sum((train_data$int_rate - mean(train_data$int_rate))^2))
lm.test_r2 <- 1 - (sum((test_data$int_rate - lm.test_predictions)^2) / sum((test_data$int_rate - mean(test_data$int_rate))^2))
# display the metrics
cat("Training MSE:", lm.train_mse, "\n")
Training MSE: 10.33743
cat("Testing MSE:", lm.test_mse, "\n")
Testing MSE: 10.76845
cat("Training RMSE:", lm.train_rmse, "\n")
Training RMSE: 3.215187
cat("Testing RMSE:", lm.test_rmse, "\n")
Testing RMSE: 3.281531
cat("Training MAE:", lm.train_mae, "\n")
Training MAE: 2.560509
cat("Testing MAE:", lm.test_mae, "\n")
Testing MAE: 2.563904
cat("Training R-squared (R²):", lm.train_r2, "\n")
Training R-squared (R²): 0.4299502
cat("Testing R-squared (R²):", lm.test_r2, "\n")
Testing R-squared (R²): 0.4069667
Lasso It standardizes data automatically
lasso.predictors_train <- model.matrix(int_rate ~ ., train_data)[,-1]
lasso.target_train <- train_data$int_rate
lasso.predictors_test <- model.matrix(int_rate ~ ., test_data)[,-1]
lasso.target_test <- test_data$int_rate
lasso.fit <- glmnet(lasso.predictors_train, lasso.target_train, alpha = 1)
plot(lasso.fit, label=TRUE)
# make predictions on the training and testing data
lasso.train_predictions <- predict(lasso.fit, newdata = train_data, newx = lasso.predictors_train)
lasso.test_predictions <- predict(lasso.fit, newdata = test_data, newx = lasso.predictors_train)
# calculate Mean Squared Error (MSE) for training and testing
lasso.train_mse <- mean((lasso.train_predictions - train_data$int_rate)^2)
lasso.test_mse <- mean((lasso.test_predictions - test_data$int_rate)^2)
Warning: longer object length is not a multiple of shorter object length
# calculate Root Mean Squared Error (RMSE) for training and testing
lasso.train_rmse <- sqrt(lasso.train_mse)
lasso.test_rmse <- sqrt(lasso.test_mse)
# calculate Mean Absolute Error (MAE) for training and testing
lasso.train_mae <- mean(abs(lasso.train_predictions - train_data$int_rate))
lasso.test_mae <- mean(abs(lasso.test_predictions - test_data$int_rate))
Warning: longer object length is not a multiple of shorter object length
# calculate R-squared (R²) for training and testing
lasso.train_r2 <- 1 - (sum((train_data$int_rate - lasso.train_predictions)^2) / sum((train_data$int_rate - mean(train_data$int_rate))^2))
lasso.test_r2 <- 1 - (sum((test_data$int_rate - lasso.test_predictions)^2) / sum((test_data$int_rate - mean(test_data$int_rate))^2))
Warning: longer object length is not a multiple of shorter object length
# display the metrics
cat("Training MSE:", lasso.train_mse, "\n")
Training MSE: 11.57067
cat("Testing MSE:", lasso.test_mse, "\n")
Testing MSE: 23.50268
cat("Training RMSE:", lasso.train_rmse, "\n")
Training RMSE: 3.401569
cat("Testing RMSE:", lasso.test_rmse, "\n")
Testing RMSE: 4.847956
cat("Training MAE:", lasso.train_mae, "\n")
Training MAE: 2.721616
cat("Testing MAE:", lasso.test_mae, "\n")
Testing MAE: 3.911493
cat("Training R-squared (R²):", lasso.train_r2, "\n")
Training R-squared (R²): -46.21615
cat("Testing R-squared (R²):", lasso.test_r2, "\n")
Testing R-squared (R²): -382.1256
Ridge It standardizes data automatically
ridge.predictors_train <- model.matrix(int_rate ~ ., train_data)[,-1]
ridge.target_train <- train_data$int_rate
ridge.predictors_test <- model.matrix(int_rate ~ ., test_data)[,-1]
ridge.target_test <- test_data$int_rate
ridge.fit <- glmnet(ridge.predictors_train, ridge.target_train, alpha = 0)
plot(ridge.fit, label=TRUE, xlab = "L2 Norm")
# make predictions on the training and testing data
ridge.train_predictions <- predict(ridge.fit, newdata = train_data, newx = ridge.predictors_train)
ridge.test_predictions <- predict(ridge.fit, newdata = test_data, newx = ridge.predictors_train)
# calculate Mean Squared Error (MSE) for training and testing
ridge.train_mse <- mean((ridge.train_predictions - train_data$int_rate)^2)
ridge.test_mse <- mean((ridge.test_predictions - test_data$int_rate)^2)
Warning: longer object length is not a multiple of shorter object length
# calculate Root Mean Squared Error (RMSE) for training and testing
ridge.train_rmse <- sqrt(ridge.train_mse)
ridge.test_rmse <- sqrt(ridge.test_mse)
# calculate Mean Absolute Error (MAE) for training and testing
ridge.train_mae <- mean(abs(ridge.train_predictions - train_data$int_rate))
ridge.test_mae <- mean(abs(ridge.test_predictions - test_data$int_rate))
Warning: longer object length is not a multiple of shorter object length
# calculate R-squared (R²) for training and testing
ridge.train_r2 <- 1 - (sum((train_data$int_rate - ridge.train_predictions)^2) / sum((train_data$int_rate - mean(train_data$int_rate))^2))
ridge.test_r2 <- 1 - (sum((test_data$int_rate - ridge.test_predictions)^2) / sum((test_data$int_rate - mean(test_data$int_rate))^2))
Warning: longer object length is not a multiple of shorter object length
# display the metrics
cat("Training MSE:", ridge.train_mse, "\n")
Training MSE: 14.52168
cat("Testing MSE:", ridge.test_mse, "\n")
Testing MSE: 20.06712
cat("Training RMSE:", ridge.train_rmse, "\n")
Training RMSE: 3.810732
cat("Testing RMSE:", ridge.test_rmse, "\n")
Testing RMSE: 4.479634
cat("Training MAE:", ridge.train_mae, "\n")
Training MAE: 3.053025
cat("Testing MAE:", ridge.test_mae, "\n")
Testing MAE: 3.612791
cat("Training R-squared (R²):", ridge.train_r2, "\n")
Training R-squared (R²): -79.07873
cat("Testing R-squared (R²):", ridge.test_r2, "\n")
Testing R-squared (R²): -441.0558
K fold using K=5:
# define the number of folds for cross-validation
num_folds <- 5
folds <- createFolds(train_data$int_rate, k = num_folds, list = TRUE)
K fold using K=5 and linear regression:
#### Linear Regresion applying Cross Validation with k=5 ####
# initialize lists to store models and their results
lm.k5.models <- list()
lm.k5.results <- data.frame()
# perform k-fold cross-validation
for(i in seq_along(folds)) {
# split the data into training and testing for the current fold
train_indices <- folds[[i]]
test_indices <- setdiff(seq_len(nrow(train_data)), train_indices)
train_data_fold <- train_data[train_indices, ]
test_data_fold <- train_data[test_indices, ]
# fit the model on the training fold
lm.k5 <- lm(int_rate ~ ., data = train_data_fold)
lm.k5.models[[i]] <- lm.k5 # Store the model
# make predictions on the training and testing fold
lm.k5.train_predictions <- predict(lm.k5, newdata = train_data_fold)
lm.k5.test_predictions <- predict(lm.k5, newdata = test_data_fold)
# calculate metrics for training fold
lm.k5.train_mse <- mean((lm.k5.train_predictions - train_data_fold$int_rate)^2)
lm.k5.train_rmse <- sqrt(lm.k5.train_mse)
lm.k5.train_mae <- mean(abs(lm.k5.train_predictions - train_data_fold$int_rate))
lm.k5.train_r2 <- summary(lm.k5)$r.squared
# calculate metrics for testing fold
lm.k5.test_mse <- mean((lm.k5.test_predictions - test_data_fold$int_rate)^2)
lm.k5.test_rmse <- sqrt(lm.k5.test_mse)
lm.k5.test_mae <- mean(abs(lm.k5.test_predictions - test_data_fold$int_rate))
lm.k5.test_r2 <- 1 - (sum((test_data_fold$int_rate - lm.k5.test_predictions)^2) / sum((test_data_fold$int_rate - mean(test_data_fold$int_rate))^2))
# store metrics in the results dataframe
lm.k5.results <- rbind(lm.k5.results, data.frame(
Fold = i,
Train_MSE = lm.k5.train_mse, Test_MSE = lm.k5.test_mse,
Train_RMSE = lm.k5.train_rmse, Test_RMSE = lm.k5.test_rmse,
Train_MAE = lm.k5.train_mae, Test_MAE = lm.k5.test_mae,
Train_R2 = lm.k5.train_r2, Test_R2 = lm.k5.test_r2
))
}
# display the models and their metrics
print(lm.k5.models)
[[1]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
1.687e+00 2.765e-05 3.820e+00
emp_length home_ownership annual_inc
1.312e-02 2.663e-01 -6.405e-06
verification_status purpose addr_state
7.579e-01 3.114e-01 7.812e-04
dti delinq_2yrs earliest_cr_line
3.244e-03 2.937e-02 1.833e-09
inq_last_6mths open_acc pub_rec
9.535e-01 7.908e-02 4.089e-01
revol_bal revol_util total_acc
6.745e-06 4.683e-02 -3.022e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-9.840e-01 3.447e-01 -1.505e-06
dti_joint verification_status_joint acc_now_delinq
-5.085e-02 6.851e-01 1.370e+00
tot_coll_amt tot_cur_bal open_acc_6m
4.343e-05 -1.083e-06 6.276e-02
open_il_6m open_il_12m open_il_24m
-1.738e-01 6.270e-01 1.428e-01
total_bal_il il_util open_rv_12m
4.173e-06 7.194e-03 1.974e-01
open_rv_24m max_bal_bc all_util
3.619e-02 -6.566e-05 -2.253e-03
total_rev_hi_lim inq_fi total_cu_tl
-1.496e-05 -3.121e-02 -3.205e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
1.041e-01 -1.892e-01 -1.256e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
3.029e-01 -1.474e-01
[[2]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
2.496e+00 3.260e-05 3.738e+00
emp_length home_ownership annual_inc
1.790e-02 2.413e-01 -3.583e-06
verification_status purpose addr_state
6.970e-01 3.238e-01 2.388e-04
dti delinq_2yrs earliest_cr_line
5.065e-02 4.697e-02 1.832e-09
inq_last_6mths open_acc pub_rec
9.412e-01 6.030e-02 3.292e-01
revol_bal revol_util total_acc
2.945e-06 4.295e-02 -3.410e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-1.026e+00 3.403e-01 -1.607e-05
dti_joint verification_status_joint acc_now_delinq
5.920e-02 4.294e-01 1.259e+00
tot_coll_amt tot_cur_bal open_acc_6m
2.413e-05 -1.093e-06 -2.077e-02
open_il_6m open_il_12m open_il_24m
-1.338e-01 7.100e-01 1.539e-02
total_bal_il il_util open_rv_12m
3.967e-06 3.744e-03 1.603e-01
open_rv_24m max_bal_bc all_util
4.186e-02 -6.518e-05 -3.778e-04
total_rev_hi_lim inq_fi total_cu_tl
-1.505e-05 2.988e-02 -4.994e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
9.086e-02 -1.958e-01 -2.155e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
2.527e-01 -1.356e-01
[[3]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
2.289e+00 2.713e-05 3.754e+00
emp_length home_ownership annual_inc
1.088e-02 2.510e-01 -5.645e-06
verification_status purpose addr_state
7.132e-01 3.364e-01 -5.472e-04
dti delinq_2yrs earliest_cr_line
4.722e-02 2.672e-02 1.911e-09
inq_last_6mths open_acc pub_rec
9.720e-01 5.061e-02 3.960e-01
revol_bal revol_util total_acc
-3.466e-06 4.658e-02 -3.297e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-1.054e+00 3.183e-01 -1.479e-05
dti_joint verification_status_joint acc_now_delinq
5.008e-02 2.727e-01 1.299e+00
tot_coll_amt tot_cur_bal open_acc_6m
2.562e-05 -1.092e-06 4.777e-02
open_il_6m open_il_12m open_il_24m
-1.469e-01 7.745e-01 -2.142e-02
total_bal_il il_util open_rv_12m
3.063e-07 5.769e-03 1.045e-01
open_rv_24m max_bal_bc all_util
8.413e-02 -4.547e-05 -3.304e-03
total_rev_hi_lim inq_fi total_cu_tl
-4.083e-06 -1.724e-02 -2.834e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
9.800e-02 -2.113e-01 -2.008e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
2.711e-01 -1.203e-01
[[4]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
1.366e+00 2.851e-05 3.795e+00
emp_length home_ownership annual_inc
2.084e-02 2.826e-01 -5.818e-06
verification_status purpose addr_state
7.511e-01 3.187e-01 1.663e-03
dti delinq_2yrs earliest_cr_line
4.008e-03 5.161e-02 1.823e-09
inq_last_6mths open_acc pub_rec
9.358e-01 8.252e-02 4.214e-01
revol_bal revol_util total_acc
7.686e-06 4.530e-02 -2.953e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-9.783e-01 3.079e-01 -2.172e-05
dti_joint verification_status_joint acc_now_delinq
2.041e-02 1.316e+00 1.066e+00
tot_coll_amt tot_cur_bal open_acc_6m
1.879e-05 -1.163e-06 -6.310e-02
open_il_6m open_il_12m open_il_24m
-1.750e-01 8.969e-01 -1.558e-02
total_bal_il il_util open_rv_12m
4.716e-06 3.640e-03 2.315e-01
open_rv_24m max_bal_bc all_util
3.048e-02 -3.883e-05 -4.492e-03
total_rev_hi_lim inq_fi total_cu_tl
-1.707e-05 1.089e-01 -6.896e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
6.442e-02 -1.882e-01 -1.144e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
2.220e-01 -1.363e-01
[[5]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
2.001e+00 2.174e-05 3.796e+00
emp_length home_ownership annual_inc
1.740e-02 2.294e-01 -3.435e-06
verification_status purpose addr_state
7.323e-01 3.353e-01 -3.753e-04
dti delinq_2yrs earliest_cr_line
4.920e-02 3.258e-02 1.922e-09
inq_last_6mths open_acc pub_rec
9.635e-01 4.787e-02 4.137e-01
revol_bal revol_util total_acc
-5.206e-06 4.752e-02 -3.152e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-1.013e+00 2.512e-01 -2.362e-06
dti_joint verification_status_joint acc_now_delinq
4.135e-02 -1.095e-01 1.309e+00
tot_coll_amt tot_cur_bal open_acc_6m
3.261e-05 -1.700e-06 -7.970e-02
open_il_6m open_il_12m open_il_24m
-1.165e-01 7.144e-01 1.163e-01
total_bal_il il_util open_rv_12m
1.046e-06 5.671e-03 3.601e-01
open_rv_24m max_bal_bc all_util
1.314e-02 -4.553e-05 -4.432e-03
total_rev_hi_lim inq_fi total_cu_tl
-3.388e-06 6.541e-02 -3.427e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
3.553e-02 -2.122e-01 -1.419e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
3.005e-01 -1.368e-01
print(lm.k5.results)
plot_metric <- function(results_long, metric) {
# adjust the variable names based on the metric
variables <- if (metric == "OOB") {
"OOB_Error"
} else {
c(paste0('Train_', metric), paste0('Test_', metric))
}
title <- if (metric == "OOB") {
paste0(metric, ' per Fold')
} else {
paste0('Train vs Test ', metric, ' per Fold')
}
ggplot(results_long[results_long$variable %in% variables, ],
aes(x = Fold, y = value, color = variable)) +
geom_line() +
geom_point() +
theme_minimal() +
labs(title = title,
x = 'Fold',
y = metric)
}
# reshape data for plotting
lm.k5.results_long <- melt(lm.k5.results, id.vars = 'Fold')
# plot for each metric
p1 <- plot_metric(lm.k5.results_long, 'MSE')
p2 <- plot_metric(lm.k5.results_long, 'RMSE')
p3 <- plot_metric(lm.k5.results_long, 'MAE')
p4 <- plot_metric(lm.k5.results_long, 'R2')
# arrange the plots in a 2x2 grid
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
plot(p1)
plot(p2)
plot(p3)
plot(p4)
K fold using K=5 and Random Forest:
#### Random Forest applying Cross Validation with k=5 ####
# initialize lists to store models and their results
rf.k5.models <- list()
rf.k5.results <- data.frame()
# perform k-fold cross-validation
for(i in seq_along(folds)) {
# split the data into training and testing for the current fold
train_indices <- folds[[i]]
test_indices <- setdiff(seq_len(nrow(train_data)), train_indices)
train_data_fold <- train_data[train_indices, ]
test_data_fold <- train_data[test_indices, ]
# fit the model on the training fold
rf.k5 <- ranger(formula = int_rate ~ ., data = train_data_fold, num.trees = 500, verbose=TRUE, importance = "impurity", oob.error = TRUE)
rf.k5.models[[i]] <- rf.k5 # Store the model
# make predictions on the training and testing fold
rf.k5.train_predictions <- predict(rf.k5, data = train_data_fold)$predictions
rf.k5.test_predictions <- predict(rf.k5, data = test_data_fold)$predictions
# calculate metrics for training fold
rf.k5.train_mse <- mean((rf.k5.train_predictions - train_data_fold$int_rate)^2)
rf.k5.train_rmse <- sqrt(rf.k5.train_mse)
rf.k5.train_mae <- mean(abs(rf.k5.train_predictions - train_data_fold$int_rate))
rf.k5.oob_error <- rf.k5$prediction.error
# calculate metrics for testing fold
rf.k5.test_mse <- mean((rf.k5.test_predictions - test_data_fold$int_rate)^2)
rf.k5.test_rmse <- sqrt(rf.k5.test_mse)
rf.k5.test_mae <- mean(abs(rf.k5.test_predictions - test_data_fold$int_rate))
rf.k5.test_r2 <- 1 - (sum((test_data_fold$int_rate - rf.k5.test_predictions)^2) / sum((test_data_fold$int_rate - mean(test_data_fold$int_rate))^2))
# store metrics in the results dataframe
rf.k5.results <- rbind(rf.k5.results, data.frame(
Fold = i,
Train_MSE = rf.k5.train_mse, Test_MSE = rf.k5.test_mse,
Train_RMSE = rf.k5.train_rmse, Test_RMSE = rf.k5.test_rmse,
Train_MAE = rf.k5.train_mae, Test_MAE = rf.k5.test_mae,
OOB_Error = rf.k5.oob_error
))
}
Growing trees.. Progress: 97%. Estimated remaining time: 0 seconds.
# display the models and their metrics
print(rf.k5.models)
[[1]]
Ranger result
Call:
ranger(formula = int_rate ~ ., data = train_data_fold, num.trees = 500, verbose = TRUE, importance = "impurity", oob.error = TRUE)
Type: Regression
Number of trees: 500
Sample size: 126773
Number of independent variables: 43
Mtry: 6
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 8.681077
R squared (OOB): 0.52044
[[2]]
Ranger result
Call:
ranger(formula = int_rate ~ ., data = train_data_fold, num.trees = 500, verbose = TRUE, importance = "impurity", oob.error = TRUE)
Type: Regression
Number of trees: 500
Sample size: 126774
Number of independent variables: 43
Mtry: 6
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 8.752434
R squared (OOB): 0.5164565
[[3]]
Ranger result
Call:
ranger(formula = int_rate ~ ., data = train_data_fold, num.trees = 500, verbose = TRUE, importance = "impurity", oob.error = TRUE)
Type: Regression
Number of trees: 500
Sample size: 126773
Number of independent variables: 43
Mtry: 6
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 8.719473
R squared (OOB): 0.518891
[[4]]
Ranger result
Call:
ranger(formula = int_rate ~ ., data = train_data_fold, num.trees = 500, verbose = TRUE, importance = "impurity", oob.error = TRUE)
Type: Regression
Number of trees: 500
Sample size: 126772
Number of independent variables: 43
Mtry: 6
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 8.76257
R squared (OOB): 0.5168671
[[5]]
Ranger result
Call:
ranger(formula = int_rate ~ ., data = train_data_fold, num.trees = 500, verbose = TRUE, importance = "impurity", oob.error = TRUE)
Type: Regression
Number of trees: 500
Sample size: 126773
Number of independent variables: 43
Mtry: 6
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 8.769394
R squared (OOB): 0.5183893
print(rf.k5.results)
# reshape data for plotting
rf.k5.results_long <- melt(rf.k5.results, id.vars = 'Fold')
# plot for each metric
p1 <- plot_metric(rf.k5.results_long, 'MSE')
p2 <- plot_metric(rf.k5.results_long, 'RMSE')
p3 <- plot_metric(rf.k5.results_long, 'MAE')
p4 <- plot_metric(rf.k5.results_long, 'OOB')
# arrange the plots in a 2x2 grid
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
plot(p1)
plot(p2)
plot(p3)
plot(p4)
K fold using K=5 and Boosting:
#### Boosting applying Cross Validation with k=5 ####
# initialize lists to store models and their results
xgb.k5.models <- list()
xgb.k5.results <- data.frame()
# perform k-fold cross-validation
for(i in seq_along(folds)) {
# split the data into training and testing for the current fold
train_indices <- folds[[i]]
test_indices <- setdiff(seq_len(nrow(train_data)), train_indices)
train_data_fold <- train_data[train_indices, ]
test_data_fold <- train_data[test_indices, ]
# prepare data for xgboost
xgb.y_train_fold <- train_data_fold$int_rate
xgb.X_train_fold <- as.matrix(train_data_fold[, -which(names(train_data_fold) == 'int_rate')])
xgb.y_test_fold <- test_data_fold$int_rate
xgb.X_test_fold <- as.matrix(test_data_fold[, -which(names(test_data_fold) == 'int_rate')])
# fit the xgboost model on the training fold
xgb.k5 <- xgboost(
data = xgb.X_train_fold,
label = xgb.y_train_fold,
nrounds = 100,
verbose = 0
)
xgb.k5.models[[i]] <- xgb.k5 # store the model
# make predictions on the training fold
xgb.k5.train_predictions <- predict(xgb.k5, newdata = xgb.X_train_fold)
# make predictions on the testing fold
xgb.k5.test_predictions <- predict(xgb.k5, newdata = xgb.X_test_fold)
# calculate metrics for training fold
xgb.k5.train_mse <- mean((xgb.k5.train_predictions - train_data_fold$int_rate)^2)
xgb.k5.train_rmse <- sqrt(xgb.k5.train_mse)
xgb.k5.train_mae <- mean(abs(xgb.k5.train_predictions - train_data_fold$int_rate))
xgb.k5.train_r2 <- 1 - (sum((xgb.y_train_fold - xgb.k5.train_predictions)^2) / sum((xgb.y_train_fold - mean(xgb.y_train_fold))^2))
# calculate metrics for testing fold
xgb.k5.test_mse <- mean((xgb.k5.test_predictions - xgb.y_test_fold)^2)
xgb.k5.test_rmse <- sqrt(xgb.k5.test_mse)
xgb.k5.test_mae <- mean(abs(xgb.k5.test_predictions - xgb.y_test_fold))
xgb.k5.test_r2 <- 1 - (sum((xgb.y_test_fold - xgb.k5.test_predictions)^2) / sum((xgb.y_test_fold - mean(xgb.y_test_fold))^2))
# store metrics in the results dataframe
xgb.k5.results <- rbind(xgb.k5.results, data.frame(
Fold = i,
Train_MSE = xgb.k5.train_mse, Test_MSE = xgb.k5.test_mse,
Train_RMSE = xgb.k5.train_rmse, Test_RMSE = xgb.k5.test_rmse,
Train_MAE = xgb.k5.train_mae, Test_MAE = xgb.k5.test_mae,
Train_R2 = xgb.k5.train_r2, Test_R2 = xgb.k5.test_r2
))
}
# display the models and their metrics
print(xgb.k5.models)
[[1]]
##### xgb.Booster
raw: 446 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[2]]
##### xgb.Booster
raw: 450 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[3]]
##### xgb.Booster
raw: 449.8 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[4]]
##### xgb.Booster
raw: 452.3 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[5]]
##### xgb.Booster
raw: 449.9 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
print(xgb.k5.results)
# reshape data for plotting
xgb.k5.results_long <- melt(xgb.k5.results, id.vars = 'Fold')
# plot for each metric
p1 <- plot_metric(xgb.k5.results_long, 'MSE')
p2 <- plot_metric(xgb.k5.results_long, 'RMSE')
p3 <- plot_metric(xgb.k5.results_long, 'MAE')
p4 <- plot_metric(xgb.k5.results_long, 'R2')
# arrange the plots in a 2x2 grid
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
plot(p1)
plot(p2)
plot(p3)
plot(p4)
K fold using K=10:
# define the number of folds for cross-validation
num_folds <- 10
folds <- createFolds(train_data$int_rate, k = num_folds, list = TRUE)
K fold using K=10 and linear regression:
#### Linear Regresion applying Cross Validation with k=10 ####
# initialize lists to store models and their results
lm.k10.models <- list()
lm.k10.results <- data.frame()
# perform k-fold cross-validation
for(i in seq_along(folds)) {
# split the data into training and testing for the current fold
train_indices <- folds[[i]]
test_indices <- setdiff(seq_len(nrow(train_data)), train_indices)
train_data_fold <- train_data[train_indices, ]
test_data_fold <- train_data[test_indices, ]
# fit the model on the training fold
lm.k10 <- lm(int_rate ~ ., data = train_data_fold)
lm.k10.models[[i]] <- lm.k10 # Store the model
# make predictions on the training and testing fold
lm.k10.train_predictions <- predict(lm.k10, newdata = train_data_fold)
lm.k10.test_predictions <- predict(lm.k10, newdata = test_data_fold)
# calculate metrics for training fold
lm.k10.train_mse <- mean((lm.k10.train_predictions - train_data_fold$int_rate)^2)
lm.k10.train_rmse <- sqrt(lm.k10.train_mse)
lm.k10.train_mae <- mean(abs(lm.k10.train_predictions - train_data_fold$int_rate))
lm.k10.train_r2 <- summary(lm.k10)$r.squared
# calculate metrics for testing fold
lm.k10.test_mse <- mean((lm.k10.test_predictions - test_data_fold$int_rate)^2)
lm.k10.test_rmse <- sqrt(lm.k10.test_mse)
lm.k10.test_mae <- mean(abs(lm.k10.test_predictions - test_data_fold$int_rate))
lm.k10.test_r2 <- 1 - (sum((test_data_fold$int_rate - lm.k10.test_predictions)^2) / sum((test_data_fold$int_rate - mean(test_data_fold$int_rate))^2))
# store metrics in the results dataframe
lm.k10.results <- rbind(lm.k10.results, data.frame(
Fold = i,
Train_MSE = lm.k10.train_mse, Test_MSE = lm.k10.test_mse,
Train_RMSE = lm.k10.train_rmse, Test_RMSE = lm.k10.test_rmse,
Train_MAE = lm.k10.train_mae, Test_MAE = lm.k10.test_mae,
Train_R2 = lm.k10.train_r2, Test_R2 = lm.k10.test_r2
))
}
# display the models and their metrics
print(lm.k10.models)
[[1]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
1.778e+00 2.108e-05 3.798e+00
emp_length home_ownership annual_inc
6.977e-03 2.084e-01 -3.357e-06
verification_status purpose addr_state
7.231e-01 3.472e-01 2.614e-04
dti delinq_2yrs earliest_cr_line
5.258e-02 1.391e-02 2.000e-09
inq_last_6mths open_acc pub_rec
9.567e-01 3.993e-02 4.354e-01
revol_bal revol_util total_acc
-7.100e-06 4.768e-02 -2.891e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-1.045e+00 3.139e-01 -2.452e-05
dti_joint verification_status_joint acc_now_delinq
7.753e-02 6.308e-01 1.057e+00
tot_coll_amt tot_cur_bal open_acc_6m
1.854e-05 -1.900e-06 1.721e-01
open_il_6m open_il_12m open_il_24m
-1.299e-01 8.152e-01 -9.802e-02
total_bal_il il_util open_rv_12m
2.824e-06 4.530e-03 2.507e-01
open_rv_24m max_bal_bc all_util
-1.891e-02 -6.114e-05 -5.394e-03
total_rev_hi_lim inq_fi total_cu_tl
-7.678e-07 9.840e-02 -7.271e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
3.369e-02 -2.143e-01 -1.371e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
2.006e-01 -1.286e-01
[[2]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
2.901e+00 2.566e-05 3.833e+00
emp_length home_ownership annual_inc
2.054e-02 2.754e-01 -5.634e-06
verification_status purpose addr_state
7.527e-01 3.146e-01 8.603e-04
dti delinq_2yrs earliest_cr_line
2.668e-03 3.360e-02 1.824e-09
inq_last_6mths open_acc pub_rec
9.478e-01 8.448e-02 4.348e-01
revol_bal revol_util total_acc
7.358e-06 4.626e-02 -3.248e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-9.449e-01 2.852e-01 -3.859e-05
dti_joint verification_status_joint acc_now_delinq
2.706e-01 -2.522e-01 1.406e+00
tot_coll_amt tot_cur_bal open_acc_6m
2.211e-05 -9.783e-07 -6.330e-03
open_il_6m open_il_12m open_il_24m
-1.926e-01 7.235e-01 9.869e-02
total_bal_il il_util open_rv_12m
5.355e-06 5.335e-03 1.552e-01
open_rv_24m max_bal_bc all_util
7.564e-02 -6.177e-05 -4.289e-03
total_rev_hi_lim inq_fi total_cu_tl
-1.636e-05 -1.770e-02 -2.800e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
4.683e-02 -1.946e-01 -1.121e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
2.278e-01 -1.417e-01
[[3]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
3.638e+00 2.815e-05 3.748e+00
emp_length home_ownership annual_inc
1.227e-02 2.979e-01 -4.727e-06
verification_status purpose addr_state
7.383e-01 3.214e-01 9.522e-04
dti delinq_2yrs earliest_cr_line
4.631e-02 2.049e-02 1.742e-09
inq_last_6mths open_acc pub_rec
9.674e-01 6.375e-02 3.885e-01
revol_bal revol_util total_acc
5.724e-06 4.244e-02 -3.406e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-1.041e+00 4.686e-01 -5.396e-06
dti_joint verification_status_joint acc_now_delinq
4.760e-02 -8.511e-01 1.516e+00
tot_coll_amt tot_cur_bal open_acc_6m
2.244e-05 -7.431e-07 -1.228e-01
open_il_6m open_il_12m open_il_24m
-1.050e-01 8.781e-01 -1.392e-02
total_bal_il il_util open_rv_12m
-1.389e-06 4.273e-03 2.565e-01
open_rv_24m max_bal_bc all_util
4.627e-02 -4.357e-05 -1.490e-04
total_rev_hi_lim inq_fi total_cu_tl
-1.673e-05 1.024e-01 -7.620e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
3.836e-02 -1.998e-01 -1.884e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
2.711e-01 -1.442e-01
[[4]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
2.295e+00 2.959e-05 3.795e+00
emp_length home_ownership annual_inc
1.892e-02 2.594e-01 -6.514e-06
verification_status purpose addr_state
7.459e-01 3.133e-01 -3.027e-05
dti delinq_2yrs earliest_cr_line
1.785e-03 2.504e-02 1.882e-09
inq_last_6mths open_acc pub_rec
9.314e-01 8.219e-02 3.286e-01
revol_bal revol_util total_acc
8.685e-06 4.601e-02 -2.954e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-9.616e-01 5.323e-01 -6.292e-06
dti_joint verification_status_joint acc_now_delinq
-4.154e-02 4.494e-01 1.095e+00
tot_coll_amt tot_cur_bal open_acc_6m
2.995e-05 -1.247e-06 -7.437e-02
open_il_6m open_il_12m open_il_24m
-1.234e-01 7.278e-01 9.718e-02
total_bal_il il_util open_rv_12m
2.178e-06 7.526e-03 3.600e-01
open_rv_24m max_bal_bc all_util
2.387e-03 -1.382e-06 -8.134e-03
total_rev_hi_lim inq_fi total_cu_tl
-1.639e-05 1.741e-01 -6.909e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
6.917e-02 -2.081e-01 -2.069e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
3.135e-01 -1.182e-01
[[5]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
2.998e+00 3.093e-05 3.756e+00
emp_length home_ownership annual_inc
9.611e-03 2.404e-01 -2.415e-06
verification_status purpose addr_state
6.917e-01 3.243e-01 2.593e-04
dti delinq_2yrs earliest_cr_line
5.377e-02 1.772e-02 1.827e-09
inq_last_6mths open_acc pub_rec
9.168e-01 6.338e-02 4.192e-01
revol_bal revol_util total_acc
4.110e-06 4.327e-02 -3.438e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-1.002e+00 1.247e-01 6.189e-06
dti_joint verification_status_joint acc_now_delinq
-6.133e-02 -2.462e-01 7.604e-01
tot_coll_amt tot_cur_bal open_acc_6m
4.426e-05 -1.149e-06 6.598e-03
open_il_6m open_il_12m open_il_24m
-1.448e-01 3.746e-01 1.124e-01
total_bal_il il_util open_rv_12m
3.174e-07 5.593e-03 1.588e-01
open_rv_24m max_bal_bc all_util
2.754e-02 -6.245e-05 -2.970e-03
total_rev_hi_lim inq_fi total_cu_tl
-1.691e-05 -5.206e-02 -1.046e-01
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
9.716e-02 -2.083e-01 -9.049e-02
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
1.455e-01 -1.310e-01
[[6]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
2.067e-02 3.412e-05 3.694e+00
emp_length home_ownership annual_inc
1.364e-02 2.492e-01 -6.128e-06
verification_status purpose addr_state
6.959e-01 3.290e-01 -2.169e-04
dti delinq_2yrs earliest_cr_line
4.054e-02 5.693e-02 1.741e-09
inq_last_6mths open_acc pub_rec
9.554e-01 7.110e-02 4.577e-01
revol_bal revol_util total_acc
7.136e-06 4.279e-02 -3.529e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-1.014e+00 2.466e-01 9.446e-06
dti_joint verification_status_joint acc_now_delinq
-3.202e-01 2.229e+00 1.390e+00
tot_coll_amt tot_cur_bal open_acc_6m
2.225e-05 -8.418e-07 7.038e-03
open_il_6m open_il_12m open_il_24m
-2.195e-01 6.524e-01 1.341e-01
total_bal_il il_util open_rv_12m
3.279e-06 4.687e-03 2.853e-01
open_rv_24m max_bal_bc all_util
6.596e-03 -6.769e-05 4.724e-03
total_rev_hi_lim inq_fi total_cu_tl
-1.697e-05 -4.337e-02 -4.166e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
1.036e-01 -1.706e-01 -1.246e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
3.146e-01 -1.415e-01
[[7]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
1.825e+00 3.079e-05 3.741e+00
emp_length home_ownership annual_inc
2.133e-02 2.608e-01 -4.716e-06
verification_status purpose addr_state
7.121e-01 3.155e-01 4.318e-04
dti delinq_2yrs earliest_cr_line
4.771e-02 4.869e-02 1.835e-09
inq_last_6mths open_acc pub_rec
9.860e-01 6.803e-02 3.876e-01
revol_bal revol_util total_acc
6.986e-06 4.247e-02 -3.654e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-1.023e+00 1.999e-01 -1.387e-05
dti_joint verification_status_joint acc_now_delinq
2.978e-02 4.312e-01 1.502e+00
tot_coll_amt tot_cur_bal open_acc_6m
4.461e-05 -1.036e-06 1.088e-01
open_il_6m open_il_12m open_il_24m
-1.352e-01 7.271e-01 6.745e-02
total_bal_il il_util open_rv_12m
2.335e-06 3.981e-03 1.189e-01
open_rv_24m max_bal_bc all_util
8.138e-02 -6.028e-05 -1.924e-03
total_rev_hi_lim inq_fi total_cu_tl
-1.640e-05 1.844e-02 -8.906e-03
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
7.226e-02 -1.914e-01 -1.429e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
2.999e-01 -1.458e-01
[[8]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
2.037e+00 2.574e-05 3.798e+00
emp_length home_ownership annual_inc
1.503e-02 2.297e-01 -4.558e-06
verification_status purpose addr_state
7.240e-01 3.290e-01 -7.392e-04
dti delinq_2yrs earliest_cr_line
4.611e-02 6.752e-02 1.974e-09
inq_last_6mths open_acc pub_rec
9.740e-01 3.992e-02 4.222e-01
revol_bal revol_util total_acc
-9.588e-06 4.823e-02 -2.884e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-1.051e+00 3.011e-01 -4.061e-06
dti_joint verification_status_joint acc_now_delinq
3.964e-03 -6.782e-02 1.081e+00
tot_coll_amt tot_cur_bal open_acc_6m
2.240e-05 -1.498e-06 -3.156e-02
open_il_6m open_il_12m open_il_24m
-1.191e-01 6.820e-01 4.732e-02
total_bal_il il_util open_rv_12m
3.521e-06 4.768e-03 1.401e-01
open_rv_24m max_bal_bc all_util
9.578e-02 -6.459e-05 -2.557e-03
total_rev_hi_lim inq_fi total_cu_tl
5.959e-08 6.987e-02 -2.691e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
9.257e-02 -1.942e-01 -1.746e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
3.119e-01 -1.327e-01
[[9]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
2.355e+00 3.492e-05 3.748e+00
emp_length home_ownership annual_inc
1.737e-02 2.636e-01 -5.943e-06
verification_status purpose addr_state
7.067e-01 3.203e-01 6.919e-05
dti delinq_2yrs earliest_cr_line
4.886e-02 6.105e-02 1.851e-09
inq_last_6mths open_acc pub_rec
9.845e-01 5.654e-02 3.315e-01
revol_bal revol_util total_acc
2.492e-06 4.419e-02 -3.310e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-9.784e-01 2.698e-01 7.044e-06
dti_joint verification_status_joint acc_now_delinq
4.701e-02 -3.011e-01 1.528e+00
tot_coll_amt tot_cur_bal open_acc_6m
3.814e-05 -8.883e-07 -1.165e-01
open_il_6m open_il_12m open_il_24m
-1.696e-01 9.205e-01 6.127e-02
total_bal_il il_util open_rv_12m
4.596e-06 6.000e-03 2.665e-01
open_rv_24m max_bal_bc all_util
6.598e-02 -1.976e-05 -5.193e-03
total_rev_hi_lim inq_fi total_cu_tl
-1.410e-05 -7.860e-02 -3.716e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
1.172e-01 -1.987e-01 -1.974e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
3.706e-01 -1.402e-01
[[10]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt term
1.451e+00 3.071e-05 3.727e+00
emp_length home_ownership annual_inc
1.764e-02 2.607e-01 -4.339e-06
verification_status purpose addr_state
7.309e-01 3.389e-01 5.991e-04
dti delinq_2yrs earliest_cr_line
4.680e-02 2.040e-02 1.715e-09
inq_last_6mths open_acc pub_rec
9.219e-01 6.120e-02 3.462e-01
revol_bal revol_util total_acc
8.017e-06 4.183e-02 -3.057e-02
initial_list_status collections_12_mths_ex_med annual_inc_joint
-9.882e-01 4.758e-01 -7.030e-06
dti_joint verification_status_joint acc_now_delinq
-5.550e-02 1.305e+00 1.684e+00
tot_coll_amt tot_cur_bal open_acc_6m
2.780e-05 -1.084e-06 1.940e-02
open_il_6m open_il_12m open_il_24m
-1.582e-01 8.023e-01 -2.529e-02
total_bal_il il_util open_rv_12m
2.130e-06 2.553e-03 2.229e-01
open_rv_24m max_bal_bc all_util
7.609e-03 -6.233e-05 5.916e-04
total_rev_hi_lim inq_fi total_cu_tl
-1.714e-05 5.684e-02 1.032e-02
inq_last_12m mths_since_delinq_cat mths_since_last_record_cat
9.648e-02 -2.010e-01 -2.017e-01
mths_since_rcnt_il_cat mths_since_last_major_derog_cat
2.745e-01 -1.373e-01
print(lm.k10.results)
plot_metric <- function(results_long, metric) {
# adjust the variable names based on the metric
variables <- if (metric == "OOB") {
"OOB_Error"
} else {
c(paste0('Train_', metric), paste0('Test_', metric))
}
title <- if (metric == "OOB") {
paste0(metric, ' per Fold')
} else {
paste0('Train vs Test ', metric, ' per Fold')
}
ggplot(results_long[results_long$variable %in% variables, ],
aes(x = Fold, y = value, color = variable)) +
geom_line() +
geom_point() +
theme_minimal() +
labs(title = title,
x = 'Fold',
y = metric)
}
# reshape data for plotting
lm.k10.results_long <- melt(lm.k10.results, id.vars = 'Fold')
# plot for each metric
p1 <- plot_metric(lm.k10.results_long, 'MSE')
p2 <- plot_metric(lm.k10.results_long, 'RMSE')
p3 <- plot_metric(lm.k10.results_long, 'MAE')
p4 <- plot_metric(lm.k10.results_long, 'R2')
# arrange the plots in a 2x2 grid
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
plot(p1)
plot(p2)
plot(p3)
plot(p4)
K fold using K=10 and Random Forest:
# #### Random Forest applying Cross Validation with k=10 ####
#
# # initialize lists to store models and their results
# rf.k10.models <- list()
# rf.k10.results <- data.frame()
#
# # perform k-fold cross-validation
# for(i in seq_along(folds)) {
# # split the data into training and testing for the current fold
# train_indices <- folds[[i]]
# test_indices <- setdiff(seq_len(nrow(train_data)), train_indices)
#
# train_data_fold <- train_data[train_indices, ]
# test_data_fold <- train_data[test_indices, ]
#
# # fit the model on the training fold
# rf.k10 <- ranger(formula = int_rate ~ ., data = train_data, num.trees = 500, verbose=TRUE, importance = "impurity", oob.error = TRUE)
# rf.k10.models[[i]] <- rf.k10 # Store the model
#
# # make predictions on the training and testing fold
# rf.k10.train_predictions <- predict(rf.k10, data = train_data_fold)$predictions
# rf.k10.test_predictions <- predict(rf.k10, data = test_data_fold)$predictions
#
# # calculate metrics for training fold
# rf.k10.train_mse <- mean((rf.k10.train_predictions - train_data_fold$int_rate)^2)
# rf.k10.train_rmse <- sqrt(rf.k10.train_mse)
# rf.k10.train_mae <- mean(abs(rf.k10.train_predictions - train_data_fold$int_rate))
# rf.k10.oob_error <- rf.k10$prediction.error
#
# # calculate metrics for testing fold
# rf.k10.test_mse <- mean((rf.k10.test_predictions - test_data_fold$int_rate)^2)
# rf.k10.test_rmse <- sqrt(rf.k10.test_mse)
# rf.k10.test_mae <- mean(abs(rf.k10.test_predictions - test_data_fold$int_rate))
# rf.k10.test_r2 <- 1 - (sum((test_data_fold$int_rate - rf.k10.test_predictions)^2) / sum((test_data_fold$int_rate - mean(test_data_fold$int_rate))^2))
#
# # store metrics in the results dataframe
# rf.k10.results <- rbind(rf.k10.results, data.frame(
# Fold = i,
# Train_MSE = rf.k10.train_mse, Test_MSE = rf.k10.test_mse,
# Train_RMSE = rf.k10.train_rmse, Test_RMSE = rf.k10.test_rmse,
# Train_MAE = rf.k10.train_mae, Test_MAE = rf.k10.test_mae,
# OOB_Error = rf.k10.oob_error
# ))
# }
#
# # display the models and their metrics
# print(rf.k10.models)
# print(rf.k10.results)
# reshape data for plotting
# rf.k10.results_long <- melt(rf.k10.results, id.vars = 'Fold')
#
# # plot for each metric
# p1 <- plot_metric(rf.k10.results_long, 'MSE')
# p2 <- plot_metric(rf.k10.results_long, 'RMSE')
# p3 <- plot_metric(rf.k10.results_long, 'MAE')
# p4 <- plot_metric(rf.k10.results_long, 'OOB')
#
# # arrange the plots in a 2x2 grid
# grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
#
# plot(p1)
# plot(p2)
# plot(p3)
# plot(p4)
K fold using K=10 and Boosting:
#### Boosting applying Cross Validation with k=10 ####
# initialize lists to store models and their results
xgb.k10.models <- list()
xgb.k10.results <- data.frame()
# perform k-fold cross-validation
for(i in seq_along(folds)) {
# split the data into training and testing for the current fold
train_indices <- folds[[i]]
test_indices <- setdiff(seq_len(nrow(train_data)), train_indices)
train_data_fold <- train_data[train_indices, ]
test_data_fold <- train_data[test_indices, ]
# prepare data for xgboost
xgb.y_train_fold <- train_data_fold$int_rate
xgb.X_train_fold <- as.matrix(train_data_fold[, -which(names(train_data_fold) == 'int_rate')])
xgb.y_test_fold <- test_data_fold$int_rate
xgb.X_test_fold <- as.matrix(test_data_fold[, -which(names(test_data_fold) == 'int_rate')])
# fit the xgboost model on the training fold
xgb.k10 <- xgboost(
data = xgb.X_train_fold,
label = xgb.y_train_fold,
nrounds = 100,
verbose = 0
)
xgb.k10.models[[i]] <- xgb.k10 # store the model
# make predictions on the training fold
xgb.k10.train_predictions <- predict(xgb.k10, newdata = xgb.X_train_fold)
# make predictions on the testing fold
xgb.k10.test_predictions <- predict(xgb.k10, newdata = xgb.X_test_fold)
# calculate metrics for training fold
xgb.k10.train_mse <- mean((xgb.k10.train_predictions - train_data_fold$int_rate)^2)
xgb.k10.train_rmse <- sqrt(xgb.k10.train_mse)
xgb.k10.train_mae <- mean(abs(xgb.k10.train_predictions - train_data_fold$int_rate))
xgb.k10.train_r2 <- 1 - (sum((xgb.y_train_fold - xgb.k10.train_predictions)^2) / sum((xgb.y_train_fold - mean(xgb.y_train_fold))^2))
# calculate metrics for testing fold
xgb.k10.test_mse <- mean((xgb.k10.test_predictions - xgb.y_test_fold)^2)
xgb.k10.test_rmse <- sqrt(xgb.k10.test_mse)
xgb.k10.test_mae <- mean(abs(xgb.k10.test_predictions - xgb.y_test_fold))
xgb.k10.test_r2 <- 1 - (sum((xgb.y_test_fold - xgb.k10.test_predictions)^2) / sum((xgb.y_test_fold - mean(xgb.y_test_fold))^2))
# store metrics in the results dataframe
xgb.k10.results <- rbind(xgb.k10.results, data.frame(
Fold = i,
Train_MSE = xgb.k10.train_mse, Test_MSE = xgb.k10.test_mse,
Train_RMSE = xgb.k10.train_rmse, Test_RMSE = xgb.k10.test_rmse,
Train_MAE = xgb.k10.train_mae, Test_MAE = xgb.k10.test_mae,
Train_R2 = xgb.k10.train_r2, Test_R2 = xgb.k10.test_r2
))
}
# display the models and their metrics
print(xgb.k10.models)
[[1]]
##### xgb.Booster
raw: 435.5 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[2]]
##### xgb.Booster
raw: 430.4 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[3]]
##### xgb.Booster
raw: 434 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[4]]
##### xgb.Booster
raw: 436 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[5]]
##### xgb.Booster
raw: 445.6 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[6]]
##### xgb.Booster
raw: 432.4 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[7]]
##### xgb.Booster
raw: 433.6 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[8]]
##### xgb.Booster
raw: 442.7 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[9]]
##### xgb.Booster
raw: 438.7 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
[[10]]
##### xgb.Booster
raw: 448.8 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 43
niter: 100
nfeatures : 43
evaluation_log:
print(xgb.k10.results)
# reshape data for plotting
xgb.k10.results_long <- melt(xgb.k10.results, id.vars = 'Fold')
# plot for each metric
p1 <- plot_metric(xgb.k10.results_long, 'MSE')
p2 <- plot_metric(xgb.k10.results_long, 'RMSE')
p3 <- plot_metric(xgb.k10.results_long, 'MAE')
p4 <- plot_metric(xgb.k10.results_long, 'R2')
# arrange the plots in a 2x2 grid
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
plot(p1)
plot(p2)
plot(p3)
plot(p4)
Decision Trees
#### Decision Trees ####
# error in tree: "factor predictors must have at most 32 levels" is thrown
# basically, it becomes computationally expensive to create so many splits in your data, since you are selecting the best split out of all 2^32 (approx) possible splits
# The error above was solved with the factor and then numeric variable transformation
# fit a decision tree model on the training data
tm <- tree(int_rate ~ ., data = train_data)
# make predictions on the training and testing data
tm.train_predictions <- predict(tm, newdata = train_data)
tm.test_predictions <- predict(tm, newdata = test_data)
# calculate Mean Squared Error (MSE) for training and testing
tm.train_mse <- mean((tm.train_predictions - train_data$int_rate)^2)
tm.test_mse <- mean((tm.test_predictions - test_data$int_rate)^2)
# calculate Root Mean Squared Error (RMSE) for training and testing
tm.train_rmse <- sqrt(tm.train_mse)
tm.test_rmse <- sqrt(tm.test_mse)
# calculate Mean Absolute Error (MAE) for training and testing
tm.train_mae <- mean(abs(tm.train_predictions - train_data$int_rate))
tm.test_mae <- mean(abs(tm.test_predictions - test_data$int_rate))
# calculate R-squared (R²) for training and testing
tm.train_r2 <- 1 - (sum((train_data$int_rate - tm.train_predictions)^2) / sum((train_data$int_rate - mean(train_data$int_rate))^2))
tm.test_r2 <- 1 - (sum((test_data$int_rate - tm.test_predictions)^2) / sum((test_data$int_rate - mean(test_data$int_rate))^2))
# display the metrics
cat("Training MSE:", tm.train_mse, "\n")
Training MSE: 13.38487
cat("Testing MSE:", tm.test_mse, "\n")
Testing MSE: 13.4105
cat("Training RMSE:", tm.train_rmse, "\n")
Training RMSE: 3.658534
cat("Testing RMSE:", tm.test_rmse, "\n")
Testing RMSE: 3.662034
cat("Training MAE:", tm.train_mae, "\n")
Training MAE: 2.953433
cat("Testing MAE:", tm.test_mae, "\n")
Testing MAE: 2.953717
cat("Training R-squared (R²):", tm.train_r2, "\n")
Training R-squared (R²): 0.2619011
cat("Testing R-squared (R²):", tm.test_r2, "\n")
Testing R-squared (R²): 0.2614654
Random Forest
#### Random Forest ####
# train a Random Forest model
rf <- ranger(formula = int_rate ~ ., data = train_data, num.trees = 500, verbose=TRUE, importance = "impurity", oob.error = TRUE)
Growing trees.. Progress: 15%. Estimated remaining time: 2 minutes, 58 seconds.
Growing trees.. Progress: 31%. Estimated remaining time: 2 minutes, 21 seconds.
Growing trees.. Progress: 47%. Estimated remaining time: 1 minute, 47 seconds.
Growing trees.. Progress: 62%. Estimated remaining time: 1 minute, 15 seconds.
Growing trees.. Progress: 78%. Estimated remaining time: 44 seconds.
Growing trees.. Progress: 93%. Estimated remaining time: 13 seconds.
Boosting
#### Boosting ####
# define the target variable for training and testing
xgb.y_train <- train_data$int_rate
xgb.y_test <- test_data$int_rate
# define the feature matrix for training and testing (exclude the target variable)
xgb.X_train <- train_data[, -which(names(train_data) == 'int_rate')]
xgb.X_test <- test_data[, -which(names(test_data) == 'int_rate')]
# fit a gradient boosting regression model using xgboost
xgb <- xgboost(
data = as.matrix(xgb.X_train),
label = xgb.y_train,
nrounds = 100,
verbose = 0
)
# make predictions on the training and testing data
xgb.train_predictions <- predict(xgb, newdata = as.matrix(xgb.X_train))
xgb.test_predictions <- predict(xgb, newdata = as.matrix(xgb.X_test))
# calculate Mean Squared Error (MSE) for training and testing
xgb.train_mse <- mean((xgb.train_predictions - xgb.y_train)^2)
xgb.test_mse <- mean((xgb.test_predictions - xgb.y_test)^2)
# calculate Root Mean Squared Error (RMSE) for training and testing
xgb.train_rmse <- sqrt(xgb.train_mse)
xgb.test_rmse <- sqrt(xgb.test_mse)
# calculate Mean Absolute Error (MAE) for training and testing
xgb.train_mae <- mean(abs(xgb.train_predictions - xgb.y_train))
xgb.test_mae <- mean(abs(xgb.test_predictions - xgb.y_test))
# calculate R-squared (R²) for training and testing
xgb.train_r2 <- 1 - (sum((xgb.y_train - xgb.train_predictions)^2) / sum((xgb.y_train - mean(xgb.y_train))^2))
xgb.test_r2 <- 1 - (sum((xgb.y_test - xgb.test_predictions)^2) / sum((xgb.y_test - mean(xgb.y_test))^2))
# display the metrics
cat("Training MSE:", xgb.train_mse, "\n")
Training MSE: 7.430606
cat("Testing MSE:", xgb.test_mse, "\n")
Testing MSE: 7.697747
cat("Training RMSE:", xgb.train_rmse, "\n")
Training RMSE: 2.725914
cat("Testing RMSE:", xgb.test_rmse, "\n")
Testing RMSE: 2.774481
cat("Training MAE:", xgb.train_mae, "\n")
Training MAE: 2.147623
cat("Testing MAE:", xgb.test_mae, "\n")
Testing MAE: 2.185803
cat("Training R-squared (R²):", xgb.train_r2, "\n")
Training R-squared (R²): 0.5902447
cat("Testing R-squared (R²):", xgb.test_r2, "\n")
Testing R-squared (R²): 0.5760744
Following, a scatter plot of actual vs predicted training values for each model is plot. This plot helps us visualize how well each model’s predictions align with the actual data points.
# create a scatter plot function
create_scatter_plot <- function(actual_values, predicted_values, model_name) {
model_comparison_data <- data.frame(
Actual = actual_values,
Predicted = predicted_values
)
scatter_plot <- ggplot(model_comparison_data, aes(x = Actual, y = Predicted)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") + # add a diagonal reference line
labs(x = "Actual Training Values", y = "Predicted Training Values", title = model_name) +
theme_minimal() +
ylim(-50, 50)
return(scatter_plot)
}
# create scatter plots for each model
lm_scatter_plot <- create_scatter_plot(
actual_values = train_data$int_rate,
predicted_values = lm.train_predictions,
model_name = "Linear Regression"
)
rf_scatter_plot <- create_scatter_plot(
actual_values = train_data$int_rate,
predicted_values = rf.train_predictions$predictions,
model_name = "Random Forest"
)
xgb_scatter_plot <- create_scatter_plot(
actual_values = xgb.y_train,
predicted_values = xgb.train_predictions,
model_name = "XGBoost"
)
# display the scatter plots separately
print(lm_scatter_plot)
print(rf_scatter_plot)
print(xgb_scatter_plot)
Following, a scatter plot of actual vs predicted testing values for each model is plot. This plot helps us visualize how well each model’s predictions align with the actual data points.
# create a scatter plot function
create_scatter_plot <- function(actual_values, predicted_values, model_name) {
model_comparison_data <- data.frame(
Actual = actual_values,
Predicted = predicted_values
)
scatter_plot <- ggplot(model_comparison_data, aes(x = Actual, y = Predicted)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") + # add a diagonal reference line
labs(x = "Actual Testing Values", y = "Predicted Testing Values", title = model_name) +
theme_minimal() +
ylim(-50, 50) +
xlim(0, 40)
return(scatter_plot)
}
# create scatter plots for each model
lm_scatter_plot <- create_scatter_plot(
actual_values = test_data$int_rate,
predicted_values = lm.test_predictions,
model_name = "Linear Regression"
)
rf_scatter_plot <- create_scatter_plot(
actual_values = test_data$int_rate,
predicted_values = rf.test_predictions$predictions,
model_name = "Random Forest"
)
xgb_scatter_plot <- create_scatter_plot(
actual_values = xgb.y_test,
predicted_values = xgb.test_predictions,
model_name = "XGBoost"
)
# display the scatter plots separately
print(lm_scatter_plot)
print(rf_scatter_plot)
print(xgb_scatter_plot)
Residual plots can help identify patterns in prediction errors and assess whether the assumptions of linear regression (if applicable) are met.
# create a residual plot function
create_residual_plot <- function(actual_values, predicted_values, model_name) {
residuals <- actual_values - predicted_values
residual_data <- data.frame(
Predicted = predicted_values,
Residuals = residuals
)
residual_plot <- ggplot(residual_data, aes(x = Predicted, y = Residuals)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") + # Red horizontal reference line
labs(x = "Predicted Values", y = "Residuals", title = paste("Residual Plot -", model_name)) +
theme_minimal() +
ylim(-30, 30) +
xlim(0, 40)
return(residual_plot)
}
# create residual plots for each model
lm_residual_plot <- create_residual_plot(
actual_values = train_data$int_rate,
predicted_values = lm.train_predictions,
model_name = "Linear Regression"
)
rf_residual_plot <- create_residual_plot(
actual_values = train_data$int_rate,
predicted_values = rf.train_predictions$predictions,
model_name = "Random Forest"
)
xgb_residual_plot <- create_residual_plot(
actual_values = xgb.y_train,
predicted_values = xgb.train_predictions,
model_name = "XGBoost"
)
# display the residual plots separately
print(lm_residual_plot)
# create a density plot function for residuals
create_residual_density_plot <- function(actual_values, predicted_values, model_name) {
residuals <- actual_values - predicted_values
residual_data <- data.frame(Residuals = residuals)
density_plot <- ggplot(residual_data, aes(x = Residuals)) +
geom_density(fill = "skyblue", color = "black", alpha = 0.7) +
labs(x = "Residuals", y = "Density", title = paste("Residual Density Plot -", model_name)) +
theme_minimal() +
xlim(-30,30) +
ylim(0, 0.35)
return(density_plot)
}
# create density plots for residuals for each model
lm_residual_density_plot <- create_residual_density_plot(
actual_values = train_data$int_rate,
predicted_values = lm.train_predictions,
model_name = "Linear Regression"
)
rf_residual_density_plot <- create_residual_density_plot(
actual_values = train_data$int_rate,
predicted_values = rf.train_predictions$predictions,
model_name = "Random Forest"
)
xgb_residual_density_plot <- create_residual_density_plot(
actual_values = xgb.y_train,
predicted_values = xgb.train_predictions,
model_name = "XGBoost"
)
# display the density plots separately
print(lm_residual_density_plot)
print(rf_residual_density_plot)
print(xgb_residual_density_plot)
This visualization can help you compare the distribution of prediction errors across models through histograms.
# create a histogram plot function for residuals with a red density curve
create_residual_histogram_plot <- function(actual_values, predicted_values, model_name) {
residuals <- actual_values - predicted_values
residual_data <- data.frame(Residuals = residuals)
histogram_plot <- ggplot(residual_data, aes(x = Residuals)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "skyblue", color = "black", alpha = 0.7) + # use density on the y-axis for the histogram
geom_density(color = "red", linewidth = 1.5) + # add the density plot in red
labs(x = "Residuals", y = "Density", title = paste("Residual Histogram Plot with Density Curve -", model_name)) +
theme_minimal() +
xlim(-20,20) +
ylim(0, 0.3)
return(histogram_plot)
}
# create histogram plots for residuals for each model
lm_residual_histogram_plot <- create_residual_histogram_plot(
actual_values = train_data$int_rate,
predicted_values = lm.train_predictions,
model_name = "Linear Regression"
)
rf_residual_histogram_plot <- create_residual_histogram_plot(
actual_values = train_data$int_rate,
predicted_values = rf.train_predictions$predictions,
model_name = "Random Forest"
)
xgb_residual_histogram_plot <- create_residual_histogram_plot(
actual_values = xgb.y_train,
predicted_values = xgb.train_predictions,
model_name = "XGBoost"
)
# display the histogram plots separately
print(lm_residual_histogram_plot)
print(rf_residual_histogram_plot)
print(xgb_residual_histogram_plot)
For each model a bar chart that displays the R-squared (coefficient of determination) values is created. R-squared measures the proportion of variance in the target variable explained by the model. Higher R-squared values indicate better model fit.
# create a data frame with R-squared values for each model
model_names <- c("Linear Regression", "Random Forest", "XGBoost")
r_squared_values_train <- c(
lm.train_r2,
rf.train_r2,
xgb.train_r2
)
r_squared_values_test <- c(
lm.test_r2,
rf.test_r2,
xgb.test_r2
)
r_squared_data_train <- data.frame(Model = factor(model_names),
R_squared = r_squared_values_train)
r_squared_data_test <- data.frame(Model = factor(model_names),
R_squared = r_squared_values_test)
# create the R-squared comparison bar chart
r_squared_bar_chart_train <- ggplot(r_squared_data_train, aes(x = Model, y = R_squared, fill = Model)) +
geom_bar(stat = "identity") +
labs(x = "Model", y = "R-squared (R²)", title = "R-squared Comparison Training") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ylim(0,1)
r_squared_bar_chart_test <- ggplot(r_squared_data_test, aes(x = Model, y = R_squared, fill = Model)) +
geom_bar(stat = "identity") +
labs(x = "Model", y = "R-squared (R²)", title = "R-squared Comparison Testing") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ylim(0,1)
# display the R-squared comparison bar chart
print(r_squared_bar_chart_train)
print(r_squared_bar_chart_test)
A bar chart that compares the MAE or RMSE values, is generated for each model. These metrics quantify the average prediction errors of each model, and lower values are preferred.
# create a data frame with MAE and RMSE values for each model
model_names <- c("Linear Regression", "Random Forest", "XGBoost","Linear Regression", "Random Forest", "XGBoost")
error_values_train <- c(
lm.train_mae,
rf.train_mae,
xgb.train_mae,
lm.train_rmse,
rf.train_rmse,
xgb.train_rmse
)
error_values_test <- c(
lm.test_mae,
rf.test_mae,
xgb.test_mae,
lm.test_rmse,
rf.test_rmse,
xgb.test_rmse
)
error_type <- c(
"MAE", "MAE", "MAE","RMSE","RMSE","RMSE"
)
model_errors_train <- data.frame(Model = factor(model_names, levels = c("Linear Regression", "Random Forest", "XGBoost")),
Error = error_values_train, Type = error_type)
model_errors_test <- data.frame(Model = factor(model_names, levels = c("Linear Regression", "Random Forest", "XGBoost")),
Error = error_values_test, Type = error_type)
# create the MAE or RMSE comparison bar chart
error_bar_chart_train <- ggplot(model_errors_train, aes(x = Model, y = Error, fill = Type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Model", y = "Error Value", title = "Training MAE and RMSE Comparison") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ylim(0, 4)
error_bar_chart_test <- ggplot(model_errors_test, aes(x = Model, y = Error, fill = Type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Model", y = "Error Value", title = "Testing MAE and RMSE Comparison") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ylim(0, 4)
# display the MAE and RMSE comparison bar chart
print(error_bar_chart_train)
print(error_bar_chart_test)
#### Random Forest Feature Importance Plot ####
v1 <- vip(rf, title = "Ranger", num_features = 20)
plot(v1)
Feature Selection from the variable importance’s analysis:
imp.variables <- lc_data[, v1$data$Variable]
imp.variables$int_rate <- lc_data$int_rate
imp.train_indices <- createDataPartition(imp.variables$int_rate, p = 0.8, list = FALSE)
# create training and testing datasets
imp.train_data <- imp.variables[imp.train_indices, ]
imp.test_data <- imp.variables[-imp.train_indices, ]
#### Linear Regression with only importance variables ####
imp.lm.fit <- lm(int_rate ~ ., data = imp.train_data)
# make predictions on the training and testing data
imp.lm.train_predictions <- predict(imp.lm.fit, newdata = imp.train_data)
imp.lm.test_predictions <- predict(imp.lm.fit, newdata = imp.test_data)
# calculate Mean Squared Error (MSE) for training and testing
imp.lm.train_mse <- mean((imp.lm.train_predictions - imp.train_data$int_rate)^2)
imp.lm.test_mse <- mean((imp.lm.test_predictions - imp.test_data$int_rate)^2)
# calculate Root Mean Squared Error (RMSE) for training and testing
imp.lm.train_rmse <- sqrt(imp.lm.train_mse)
imp.lm.test_rmse <- sqrt(imp.lm.test_mse)
# calculate Mean Absolute Error (MAE) for training and testing
imp.lm.train_mae <- mean(abs(imp.lm.train_predictions - imp.train_data$int_rate))
imp.lm.test_mae <- mean(abs(imp.lm.test_predictions - imp.test_data$int_rate))
# calculate R-squared (R²) for training and testing
imp.lm.train_r2 <- 1 - (sum((imp.train_data$int_rate - imp.lm.train_predictions)^2) / sum((imp.train_data$int_rate - mean(imp.train_data$int_rate))^2))
imp.lm.test_r2 <- 1 - (sum((imp.test_data$int_rate - imp.lm.test_predictions)^2) / sum((imp.test_data$int_rate - mean(imp.test_data$int_rate))^2))
# display the metrics
cat("Training MSE:", imp.lm.train_mse, "\n")
Training MSE: 10.38227
cat("Testing MSE:", imp.lm.test_mse, "\n")
Testing MSE: 13.5138
cat("Training RMSE:", imp.lm.train_rmse, "\n")
Training RMSE: 3.222153
cat("Testing RMSE:", imp.lm.test_rmse, "\n")
Testing RMSE: 3.676112
cat("Training MAE:", imp.lm.train_mae, "\n")
Training MAE: 2.57012
cat("Testing MAE:", imp.lm.test_mae, "\n")
Testing MAE: 2.581021
cat("Training R-squared (R²):", imp.lm.train_r2, "\n")
Training R-squared (R²): 0.4272634
cat("Testing R-squared (R²):", imp.lm.test_r2, "\n")
Testing R-squared (R²): 0.2568865
#### Random Forest with only importance variables ####
# train a Random Forest model
imp.rf <- ranger(formula = int_rate ~ ., data = imp.train_data, num.trees = 500, verbose=TRUE, importance = "impurity", oob.error = TRUE)
Growing trees.. Progress: 21%. Estimated remaining time: 2 minutes, 0 seconds.
Growing trees.. Progress: 43%. Estimated remaining time: 1 minute, 23 seconds.
Growing trees.. Progress: 65%. Estimated remaining time: 52 seconds.
Growing trees.. Progress: 86%. Estimated remaining time: 20 seconds.
# print the model summary
print("Random Forest Model Summary:")
[1] "Random Forest Model Summary:"
print(imp.rf)
Ranger result
Call:
ranger(formula = int_rate ~ ., data = imp.train_data, num.trees = 500, verbose = TRUE, importance = "impurity", oob.error = TRUE)
Type: Regression
Number of trees: 500
Sample size: 633865
Number of independent variables: 20
Mtry: 4
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 8.372169
R squared (OOB): 0.538151
# make predictions on the training and testing data
imp.rf.train_predictions <- predict(imp.rf, data = imp.train_data)
imp.rf.test_predictions <- predict(imp.rf, data = imp.test_data)
# calculate Mean Squared Error (MSE) for training and testing
imp.rf.train_mse <- mean((imp.rf.train_predictions$predictions - imp.train_data$int_rate)^2)
imp.rf.test_mse <- mean((imp.rf.test_predictions$predictions - imp.test_data$int_rate)^2)
# calculate Root Mean Squared Error (RMSE) for training and testing
imp.rf.train_rmse <- sqrt(imp.rf.train_mse)
imp.rf.test_rmse <- sqrt(imp.rf.test_mse)
# calculate Mean Absolute Error (MAE) for training and testing
imp.rf.train_mae <- mean(abs(imp.rf.train_predictions$predictions - imp.train_data$int_rate))
imp.rf.test_mae <- mean(abs(imp.rf.test_predictions$predictions - imp.test_data$int_rate))
# calculate R-squared (R²) for training and testing
imp.rf.train_r2 <- 1 - (sum((imp.train_data$int_rate - imp.rf.train_predictions$predictions)^2) / sum((imp.train_data$int_rate - mean(imp.train_data$int_rate))^2))
imp.rf.test_r2 <- 1 - (sum((test_data$int_rate - rf.test_predictions$predictions)^2) / sum((imp.test_data$int_rate - mean(imp.test_data$int_rate))^2))
# display the metrics
cat("Training MSE:", imp.rf.train_mse, "\n")
Training MSE: 1.63843
cat("Testing MSE:", imp.rf.test_mse, "\n")
Testing MSE: 8.36116
cat("Training RMSE:", imp.rf.train_rmse, "\n")
Training RMSE: 1.280012
cat("Testing RMSE:", imp.rf.test_rmse, "\n")
Testing RMSE: 2.891567
cat("Training MAE:", imp.rf.train_mae, "\n")
Training MAE: 1.005422
cat("Testing MAE:", imp.rf.test_mae, "\n")
Testing MAE: 2.296878
cat("Training R-squared (R²):", imp.rf.train_r2, "\n")
Training R-squared (R²): 0.9096162
cat("Testing R-squared (R²):", imp.rf.test_r2, "\n")
Testing R-squared (R²): 0.5410509
#### Boosting with only importance variables ####
# define the target variable for training and testing
imp.xgb.y_train <- imp.train_data$int_rate
imp.xgb.y_test <- imp.test_data$int_rate
# define the feature matrix for training and testing (exclude the target variable)
imp.xgb.X_train <- imp.train_data[, -which(names(imp.train_data) == 'int_rate')]
imp.xgb.X_test <- imp.test_data[, -which(names(imp.test_data) == 'int_rate')]
# fit a gradient boosting regression model using xgboost
imp.xgb <- xgboost(
data = as.matrix(imp.xgb.X_train),
label = imp.xgb.y_train,
nrounds = 100,
verbose = 0
)
# make predictions on the training and testing data
imp.xgb.train_predictions <- predict(imp.xgb, newdata = as.matrix(imp.xgb.X_train))
imp.xgb.test_predictions <- predict(imp.xgb, newdata = as.matrix(imp.xgb.X_test))
# calculate Mean Squared Error (MSE) for training and testing
imp.xgb.train_mse <- mean((imp.xgb.train_predictions - imp.xgb.y_train)^2)
imp.xgb.test_mse <- mean((imp.xgb.test_predictions - imp.xgb.y_test)^2)
# calculate Root Mean Squared Error (RMSE) for training and testing
imp.xgb.train_rmse <- sqrt(imp.xgb.train_mse)
imp.xgb.test_rmse <- sqrt(imp.xgb.test_mse)
# calculate Mean Absolute Error (MAE) for training and testing
imp.xgb.train_mae <- mean(abs(imp.xgb.train_predictions - imp.xgb.y_train))
imp.xgb.test_mae <- mean(abs(imp.xgb.test_predictions - imp.xgb.y_test))
# calculate R-squared (R²) for training and testing
imp.xgb.train_r2 <- 1 - (sum((imp.xgb.y_train - imp.xgb.train_predictions)^2) / sum((imp.xgb.y_train - mean(imp.xgb.y_train))^2))
imp.xgb.test_r2 <- 1 - (sum((imp.xgb.y_test - imp.xgb.test_predictions)^2) / sum((imp.xgb.y_test - mean(imp.xgb.y_test))^2))
# display the metrics
cat("Training MSE:", imp.xgb.train_mse, "\n")
Training MSE: 7.463161
cat("Testing MSE:", imp.xgb.test_mse, "\n")
Testing MSE: 7.790172
cat("Training RMSE:", imp.xgb.train_rmse, "\n")
Training RMSE: 2.731879
cat("Testing RMSE:", imp.xgb.test_rmse, "\n")
Testing RMSE: 2.791088
cat("Training MAE:", imp.xgb.train_mae, "\n")
Training MAE: 2.154894
cat("Testing MAE:", imp.xgb.test_mae, "\n")
Testing MAE: 2.198685
cat("Training R-squared (R²):", imp.xgb.train_r2, "\n")
Training R-squared (R²): 0.5882955
cat("Testing R-squared (R²):", imp.xgb.test_r2, "\n")
Testing R-squared (R²): 0.5716244
The dataset was filtered by the 20 variables with the most importance (from the rf results). As we can see above, the errors of each model are more or less the errors with the double variables we had before, so filtering by these 20 “important variables” does not seem making sense…
Hyperparameter Tuning for XGBoosting:
# define the number of cores
numCores <- detectCores() - 1
# register doParallel as the backend for parallel execution
registerDoParallel(cores=numCores)
# define the control using a cross-validation approach
train_control <- trainControl(method = "cv", number = 5, verboseIter = TRUE)
# define the grid of hyperparameters to search over
xgb.grid <- expand.grid(
nrounds = c(100, 200, 300),
eta = c(0.01, 0.05, 0.1),
max_depth = c(3, 6, 9),
gamma = c(0, 0.1, 0.2),
colsample_bytree = c(0.5, 0.8, 1),
min_child_weight = c(1, 5, 10),
subsample = c(0.5, 0.75, 1)
)
# train the model
xgb.tuned <- train(
x = train_data, y = xgb.y_train,
method = "xgbTree",
trControl = train_control,
tuneGrid = xgb.grid
)
# view the best tuning parameters
print(xgb.tuned$bestTune)
# stop the parallel backend
stopImplicitCluster()
Saving our best model:
saveRDS(xgb, file = "best_model_xgb.rds")